library(Matrix)
library(cluster)
library(lattice)
library(plotrix)
library(gridExtra)
library(KernSmooth)
library(boot)
library(parallel)
library(texreg)
library(nnet)
library(xtable)
library(lpSolve)
library(knitr)
.default_par <- par(no.readonly=TRUE)
BASEDIR <- sprintf('%s/github/post-mortem-memory', Sys.getenv('HOME'))
DATADIR <- sprintf('%s/data/', BASEDIR)
PLOTDIR <- sprintf('%s/plots/', BASEDIR)
TABLEDIR <- sprintf('%s/tables/', BASEDIR)
fignum <- 0
tabnum <- 0
figcap <- function(increment) {
if (increment) fignum <<- fignum + 1
sprintf('**%sFigure %s:** ', if (SUPP_INFO) 'Supplementary ' else '', fignum)
}
tabcap <- function(increment) {
if (increment) tabnum <<- tabnum + 1
sprintf('**%sTable %s:** ', if (SUPP_INFO) 'Supplementary ' else '', tabnum)
}
vertskip <- '$$\\\\[4mm]$$'
# We aggregate chunk_size days; a value of 7 means we aggregate by weeks.
CHUNK_SIZE <- 1
# The number of time units ('chunks') before and after death.
N <- 360
COL <- list(NEWS=rgb(.9,.6,0), TWITTER=rgb(0,.45,.7))
COL_LIGHT <- list(NEWS=rgb(.9,.6,0,.3), TWITTER=rgb(0,.45,.7,.3))
COL_XLIGHT <- list(NEWS=rgb(.9,.6,0,.03), TWITTER=rgb(0,.45,.7,.03))
COL_LIGHTBLUE <- "#56B4E9"
COL_YELLOW <- "#F0E442"
COL_DARKBLUE <- "#0072B2"
COL_RED <- "#D55E00"
COL_MAGENTA <- "#CC79A7"
COL_GRAY <- "#999999"
COL_ORANGE <- "#E69F00"
COL_GREEN <- "#009E73"
ANGLO_COUNTRY_REGEX <- 'United States|United Kingdom|Canada|England|Australia|Scotland|Wales|Ireland|New Zealand|South Africa'
type_groups <- c('art', 'sports', 'leadership', 'known for death', 'general fame', 'academia/engineering')
split_at <- function(str, sep) strsplit(str, sep)[[1]]
fancy_mid <- function(mid) sub('<http://rdf.freebase.com/ns/m.(.*)>', '/m/\\1', mid)
long_mid <- function(mid) sub('/m/(.*)', '<http://rdf.freebase.com/ns/m.\\1>', mid)
fancy_wiki <- function(wiki) sub('<http://en.wikipedia.org/wiki/(.*)>', '\\1', wiki)
long_wiki <- function(wiki) sprintf('<http://en.wikipedia.org/wiki/%s>', wiki)
FANCY_CURVE_CHAR_NAMES <- c('Pre-mortem mean', 'Short-term boost', 'Long-term boost', 'Halving time')
names(FANCY_CURVE_CHAR_NAMES) <- c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')
# A list of all days.
MIN_DATE <- as.Date("2008-08-01")
MAX_DATE <- as.Date("2014-09-30")
DAYS <- as.character(seq(MIN_DATE, MAX_DATE, by="+1 day"))
MAX_ABS_RELDATE <- as.numeric(MAX_DATE - MIN_DATE)
# Our Twitter data set is reasonably large only from 2009-06-11 on.
TWITTER_START_DATE <- as.Date("2009-06-11")
# We build a matrix x that has a row per person, and a column per day since death. There are
# nreldates = 2*(MAX_DATE-MIN_DATE)+1 such possible dates: the extreme cases are someone dying on
# MIN_DATE or MAX_DATE, so the relDates are [0 : MAX_DATE-MIN_DATE] in the former case, and
# [MIN_DATE-MAX_DATE : 0] in the latter case, for a total range of [MIN_DATE-MAX_DATE :
# MAX_DATE-MIN_DATE]; there are 2*(MAX_DATE-MIN_DATE)+1 values in this range.
# Values of relDate can be negative, zero, or positive. However, since R indices must be
# positive, we must translate relDates to get indices; the translation is as such:
# idx = relDate + MAX_DATE - MIN_DATE + 1
NRELDATES <- 2 * as.numeric(MAX_DATE - MIN_DATE) + 1
# The days on which our Spinn3r client must have been broken (there are still several tens of
# thousands of articles with dates from these days, but they were crawled on days outside of this
# window).
EMPTY_DAYS <- c(
"2009-05-19", "2009-07-15", "2010-04-05", "2010-04-20", "2010-04-21", "2010-04-22", "2010-04-23", "2010-04-24",
"2010-04-25", "2010-04-26", "2010-04-27", "2010-04-28", "2010-04-29", "2010-04-30", "2010-05-01", "2010-05-02",
"2010-05-03", "2010-05-04", "2010-05-05", "2010-05-06", "2010-05-07", "2010-05-08", "2010-05-09", "2010-05-10",
"2010-05-11", "2010-05-12", "2010-05-13", "2010-05-16", "2010-05-17", "2010-05-18", "2010-05-19", "2010-05-20",
"2010-05-21", "2010-05-22", "2010-05-23", "2010-05-24", "2010-05-25", "2010-05-26", "2010-05-27", "2010-05-28",
"2010-05-29", "2010-05-30", "2010-05-31", "2010-06-01", "2010-06-02", "2010-06-03", "2010-06-04", "2010-06-05",
"2010-06-06", "2010-06-07", "2010-06-08", "2010-06-09", "2010-06-10", "2010-06-11", "2010-06-12", "2010-06-13",
"2010-06-14", "2010-06-15", "2010-06-16", "2010-06-17", "2010-06-18", "2010-06-19", "2010-06-20", "2010-06-21",
"2010-06-22", "2010-06-23", "2010-06-24", "2010-06-25", "2010-06-26", "2010-06-27", "2010-06-28", "2010-06-29",
"2010-06-30", "2010-07-01", "2010-07-02", "2010-07-03", "2010-07-04", "2010-07-05", "2010-07-06", "2010-07-07",
"2010-07-08", "2010-07-09", "2010-07-10", "2010-07-11", "2010-07-12", "2010-07-13", "2011-12-03")
# The dates of death.
death_dates_tbl <- read.table(pipe(sprintf('gunzip -c %s/date_of_death.nt', DATADIR)),
comment.char='', sep='\t', quote='',
col.names=c('mid', '_rel', 'date', '_period'))[,c('mid', 'date')]
death_dates_tbl <- death_dates_tbl[grepl('#date>$', death_dates_tbl$date),]
death_dates_tbl$date <- substring(death_dates_tbl$date, 2, 11)
death_dates <- death_dates_tbl$date
names(death_dates) <- death_dates_tbl$mid
# Our Twitter data set is reasonably large only from 2009-06-11 on.
# Get the mids of the people that died between then and MAX_DATE (Sept. 2014).
died_in_window <- names(death_dates[as.Date(death_dates) >= TWITTER_START_DATE
& as.Date(death_dates) <= MAX_DATE])
# A mapping from mids to Wikipedia names.
wiki_tbl <- read.table(pipe(sprintf('gunzip -c %s/names.DEAD.UNAMBIGUOUS.tsv.gz', DATADIR)),
comment.char='', sep='\t', quote='',
col.names=c('mid', 'names', 'aliases', 'curid', 'wiki'),
stringsAsFactors=FALSE)
wiki_tbl$wiki <- sapply(wiki_tbl$wiki, function(x) split_at(x, '\\|')[1])
mid_to_wiki <- wiki_tbl$wiki
names(mid_to_wiki) <- wiki_tbl$mid
mid_to_wiki <- mid_to_wiki[!is.na(mid_to_wiki)]
# A mapping from mids to names.
wiki_to_mid <- names(mid_to_wiki)
names(wiki_to_mid) <- mid_to_wiki
# Properties of dead people.
props <- read.table(pipe(sprintf('gunzip -c %s/dead_people_properties.tsv.gz', DATADIR)),
comment.char='', sep='\t', quote='', header=TRUE, fill=TRUE,
col.names=c('person', 'cause_of_death', 'date_of_death', 'place_of_death', 'date_of_burial',
'place_of_burial', 'date_of_cremation', 'place_of_cremation', 'date_of_birth',
'place_of_birth', 'nationality', 'profession', 'religion', 'ethnicity',
'notable_types', 'gender'))
props$mid <- sub('(<.*>).*', '\\1', props$person)
props$gender <- as.factor(sub('<.*>(.*)', '\\1', props$gender))
death_years <- sub('"(....).*', '\\1', props$date_of_death)
birth_years <- sub('"(....).*', '\\1', props$date_of_birth)
death_years[!grepl("....", death_years)] <- NA
birth_years[!grepl("....", birth_years)] <- NA
props$age <- as.numeric(death_years) - as.numeric(birth_years)
rownames(props) <- props$mid
# Load the taxonomies.
tax_causes <- read.table(sprintf('%s/taxonomy_causes_of_death.tsv', DATADIR),
header=TRUE, sep='\t', comment.char='', quote='')
rownames(tax_causes) <- paste(tax_causes$mid, tax_causes$cause_of_death, sep='')
tax_types <- read.table(sprintf('%s/taxonomy_notable_types.tsv', DATADIR),
header=TRUE, sep='\t', comment.char='', quote='')
rownames(tax_types) <- paste(tax_types$mid, tax_types$notable_type, sep='')
# Some causes of death are missing from Janice's table. Add them manually.
natural_manual <- "Viral pneumonia|Smallpox|Dementia with Lewy bodies|Heart valve disease|Creutzfeldt–Jakob disease|T-Cell Lymphoma|Adrenocortical carcinoma|Huntington's disease|Congenital heart defect|Squamous-cell carcinoma|Atypical teratoid rhabdoid tumor|Alveolar rhabdomyosarcoma|Appendix cancer|Pyelonephritis|Polymyalgia rheumatica|Polycythemia|Leiomyosarcoma|Astrocytoma"
unnatural_manual <- "Smoke inhalation|Racing Accident|Lightning|Casualty of war|Cocaine overdose|Poisoning|Shootout|Murder–suicide|Accidental drug overdose|Blast injury"
# Create the regexes for (un)natural deaths.
natural_regex <- sprintf(">(%s|%s)($|\\|)",
paste(tax_causes$cause.of.death[tax_causes$level1=='natural'],
collapse='|'), natural_manual)
unnatural_regex <- sprintf(">(%s|%s)($|\\|)",
paste(tax_causes$cause.of.death[tax_causes$level1=='unnatural'],
collapse='|'), unnatural_manual)
# -1 = unnatural, 0 = unknown/conflicting, 1 = natural.
get_cause_of_death_map <- function() {
natural_death_mids <- props$mid[which(grepl(natural_regex, props$cause_of_death))]
unnatural_death_mids <- props$mid[which(grepl(unnatural_regex, props$cause_of_death))]
map <- (props$mid %in% natural_death_mids) - (props$mid %in% unnatural_death_mids)
# If a person has a natural and an unnatural cause, we want to list them under unnatural death.
map[props$mid %in% natural_death_mids & props$mid %in% unnatural_death_mids] <- -1
names(map) <- props$mid
return(map)
}
# e.g., <http://rdf.freebase.com/ns/m.025698c>Baseball Player --> <http://rdf.freebase.com/ns/m.025698c>
strip_plaintext_name_from_mid <- function(mid_with_name) {
sub('(<.*>).*', '\\1', mid_with_name)
}
# tax must be either tax_types or tax_causes;
# data must be a vector with taxonomy entries as values and mids as names.
select_from_tax <- function(level, value, tax, data) {
col <- sprintf('level%d', level)
ok_values <- rownames(tax)[tax[,col] == value]
names(data)[data %in% ok_values]
}
get_num_art <- function(medium) {
if (medium != 'NEWS' && medium != 'TWITTER') {
stop('Medium must be \'NEWS\' or \'TWITTER\'')
}
num_art_tbl <- read.table(sprintf('%s/num_articles_per_day_%s.tsv', DATADIR, medium),
col.names=c('date', 'num'))
num_art <- num_art_tbl$num
names(num_art) <- num_art_tbl$date
num_art <- num_art[names(num_art) >= as.character(MIN_DATE) & names(num_art) <= as.character(MAX_DATE)]
num_art[setdiff(DAYS, names(num_art))] <- 0
num_art <- num_art[order(names(num_art))]
num_art[EMPTY_DAYS] <- NA
return(num_art)
}
get_mention_freq_table <- function(medium) {
if (medium != 'NEWS' && medium != 'TWITTER') {
stop('Medium must be \'NEWS\' or \'TWITTER\'')
}
file <- sprintf('%s/RData/dead_people_mentions_%s.RData', DATADIR, medium)
if (!file.exists(file)) {
data <- read.table(pipe(sprintf('gunzip -c %s/num_dead_mentions_per_day_%s.tsv.gz', DATADIR, medium)),
comment.char='', sep='\t', quote='',
col.names=c('mid', 'date', 'num_per_doc', 'num_doc'),
colClasses=c('character', 'character', 'numeric', 'numeric'))
# Add a column having the number of days since death.
data$rel_date <- as.numeric(as.Date(data$date) - as.Date(death_dates[data$mid]))
save(data, file=file)
} else {
load(file)
}
return(data)
}
get_rel_date_matrix <- function(medium, data, num_art, chunk_size) {
if (medium == 'NEWS') {
min_num_per_doc <- 2
} else if (medium == 'TWITTER') {
min_num_per_doc <- 1
} else {
stop('Medium must be \'NEWS\' or \'TWITTER\'')
}
chunks <- floor(((1:NRELDATES)-(NRELDATES+1)/2)/chunk_size)
sum.na.rm <- function(x) sum(x, na.rm=TRUE)
idx <- which(data$num_per_doc >= min_num_per_doc)
file <- sprintf('%s/RData/num_mentions_per_rel_date_%s_min_num_per_doc=%s_chunk_size=%s.RData',
DATADIR, medium, min_num_per_doc, chunk_size)
if (!file.exists(file)) {
x <- do.call(rbind, mclapply(split(data[idx,], data$mid[idx]), function(l) {
dod <- as.Date(death_dates[l$mid[1]])
# Aggregate the counts of docs containing 1, 2, >=3 mentions.
l <- data.frame(t(simplify2array(by(l, l$date, function(X) c(
as.numeric(as.character(X$rel_date[1])),
as.numeric(as.character(sum(X$num_doc))))))))
colnames(l) <- c('rel_date', 'num_doc')
l$date <- rownames(l)
# Initialize y, the count vector for the current mid.
# y has 2*(MAX_DATE-MIN_DATE)+1 entries, of which only MAX_DATE-MIN_DATE+1 are well-defined (as per
# our discussion of NRELDATES above).
# For the MAX_DATE-MIN_DATE days for which we have no data, the values are set to NA.
# We also set to NA the values for the days that have no Spinn3r data (primarily the 2010 hole).
# For the remaining MAX_DATE-MIN_DATE+1 days, we initialize values to 0.
offset <- as.numeric(MAX_DATE-MIN_DATE) + 1
y <- rep(NA, NRELDATES)
y[as.numeric(as.Date(DAYS)-dod) + offset] <- 0
y[l$rel_date + offset] <- l$num_doc
y[as.numeric(as.Date(EMPTY_DAYS)-dod) + offset] <- NA
# n: the number of docs per day aligned in terms of relative dates.
n <- rep(0, NRELDATES)
n[as.numeric(as.Date(DAYS)-dod) + offset] <- num_art
# Sum within chunks.
s <- tapply(y, chunks, sum.na.rm) / tapply(n, chunks, sum.na.rm)
# Set to NA the days before TWITTER_START_DATE in the case of Twitter.
if (medium == 'TWITTER') {
s[as.numeric(as.Date(DAYS[as.Date(DAYS) < TWITTER_START_DATE])-dod) + offset] <- NA
}
return(s)
}, mc.cores=6))
x <- as.matrix(x)
save(x, file=file)
} else {
load(file)
}
return(x)
}
filter_people <- function(medium, x) {
N_immed_after <- 100
num_finite_before <- rowSums(is.finite(x[,colnames(x) %in% -N:-1]))
num_finite_immed_after <- rowSums(is.finite(x[,colnames(x) %in% 0:(N_immed_after-1)]))
num_active_before <- rowSums(x[,colnames(x) %in% -N:-1] > 0, na.rm=TRUE)
mids <- names(which(
# Keep only people whose boundaries aren't missing.
is.finite(x[,colnames(x)==N]) & is.finite(x[,colnames(x)==-N])
# Keep only people that have no missing data in the N_immed_after days immediately after death.
& num_finite_immed_after == N_immed_after
# Keep only people that were mentioned on at least 5 days before they died.
& num_active_before >= 5
# Keep only people that died after the point from which on we have a lot of Twitter data.
& rownames(x) %in% died_in_window
# Discard people with parentheses in their names because those names, although unique in
# Wikipedia, are unlikely to ever be used in real prose.
& !grepl('\\(', mid_to_wiki[rownames(x)])))
return(mids)
}
# Apply Friedman's super smoother, hell yeah.
supersmooth <- function(y) {
reldates <- as.numeric(names(y))
suppressWarnings({
smoothed_left <- supsmu(reldates[reldates<0], y[reldates<0])
smoothed_right <- supsmu(reldates[reldates>=0], y[reldates>=0])
})
smoothed <- c(smoothed_left$y, smoothed_right$y)
names(smoothed) <- c(smoothed_left$x, smoothed_right$x)
return(smoothed)
}
normalize_and_smooth <- function(medium, x, num_art, mean_center=TRUE) {
if (medium != 'NEWS' && medium != 'TWITTER') {
stop('Medium must be \'NEWS\' or \'TWITTER\'')
}
# Smoothing term: we add 1 mention (as computed on the highest-volume day) to each day.
eps <- 1 / max(num_art, na.rm=TRUE)
file <- sprintf('%s/RData/clustering_input_%s%s.RData', DATADIR, medium, if (mean_center) '_MEANCENTERED' else '')
if (!file.exists(file)) {
# Select the subset of reldates; keep a month of padding, so smoothing is more robust at the
# boundaries,
X <- x[,colnames(x) %in% -(N+30):(N+30)]
reldates <- as.numeric(colnames(X))
# Add-eps smoothing.
X <- X + eps
# Normalize and smooth all time series.
X <- t(apply(X, 1, function(y_unnorm) {
# Log10-transform.
y <- log10(y_unnorm)
# Smooth.
y <- supersmooth(y)
# Subtract the pre-mortem mean. Now we have log10(y_unnorm/mean*(y_unnorm)), where mean* is the
# exponential of the mean in log space (i.e., a more robust version of the mean).
# In the mean computation we exclude the 30 days immediately before death, to mitigate the
# impact of the final sick days for people whose death was foreseeable.
if (mean_center) y <- y - mean(y[reldates %in% -N:-30], na.rm=TRUE)
# Select the subset of reldates.
y <- y[names(y) %in% -N:N]
# Interpolate missing values; rule=2 means that at the boundaries the values at the boundaries are
# imputed for missing values. (But since we require our time series not to end in a missing value,
# this shouldn't happen in the post-mortem period.)
y <- approx(names(y), y, -N:N, rule=2)$y
return(y)
}))
colnames(X) <- -N:N
save(X, file=file)
} else {
load(file)
}
return(X)
}
# Normalize the time series to [0,1] and measure the fraction of time it takes until half of the
# total post-mortem volume has been seen. The simplest version maps the min to 0, but this is quite
# sensitive to low outliers, so we also allow for versions where we first set to 0 all values
# less than the specified quantile.
time_till_half <- function(y, thresh_quantile=0) {
# Start with the day after death, since the news might not be reporting yet on the day of.
y <- y[as.numeric(names(y)) >= 1]
y <- pmax(y, quantile(y, thresh_quantile))
y <- (y - min(y)) / (max(y) - min(y))
min(which(cumsum(y) >= 0.5*sum(y))) / length(y)
}
compute_curve_stats <- function(medium, x, X, num_art) {
if (medium != 'NEWS' && medium != 'TWITTER') {
stop('Medium must be \'NEWS\' or \'TWITTER\'')
}
file <- sprintf('%s/mention_freq_for_spreadsheet_%s.csv', DATADIR, medium)
if (!file.exists(file)) {
# Find the people that died (un)natural deaths.
mids <- rownames(x)
mid_nat <- intersect(mids, names(which(get_cause_of_death_map()==1)))
mid_unnat <- intersect(mids, names(which(get_cause_of_death_map()==-1)))
# The number of days we consider before and after death.
subrange_before <- -N:-30
subrange_peak <- 0:29
subrange_after <- 30:N
x <- x[,colnames(x) %in% -N:N]
num_active <- rowSums(x > 0, na.rm=TRUE)
num_active_before <- rowSums(x[,colnames(x) %in% -N:-1] > 0, na.rm=TRUE)
num_active_after <- rowSums(x[,colnames(x) %in% 0:N] > 0, na.rm=TRUE)
num_finite_before <- rowSums(is.finite(x[,colnames(x) %in% -N:-1]))
num_finite_after <- rowSums(is.finite(x[,colnames(x) %in% 0:N]))
stats <- data.frame(mid=NA,
name=NA,
image=NA,
num_active_days=NA,
active_fraction_before=NA,
active_fraction_after=NA,
max_smoothed_before=NA,
max_raw_before=NA,
mean_before=NA,
mean_after=NA,
mean_after_peak=NA,
peak_smoothed=NA,
peak_raw=NA,
peak_raw_day=NA,
time_till_half=NA,
time_till_half_thresh=NA,
natural_death=NA,
unnatural_death=NA,
gender=NA,
cause_of_death=NA,
age=NA,
notable_type=NA)[-1,]
for (i in 1:nrow(X)) {
if (i %% 100 == 0) print(i)
mid <- rownames(X)[i]
name <- fancy_wiki(mid_to_wiki[mid])
img_name <- if (!is.na(name)) gsub('(%(25)?..)+', '+', name) else sprintf('NA_%s', gsub('.*/m\\.(.*)>', '\\1', mid))
raw <- log10(x[mid,])
smoothed <- X[mid,]
mean_before <- mean(smoothed[names(smoothed) %in% subrange_before], na.rm=TRUE)
peak_smoothed <- max(smoothed[names(smoothed) %in% subrange_peak])
peak_raw <- max(raw[names(raw) %in% subrange_peak])
# -1 such that 0 represents day of death.
peak_raw_day <- which.max(raw[names(raw) %in% subrange_peak]) - 1
stats[mid,] <- c(
fancy_mid(mid),
name,
sprintf('=image("http://infolab.stanford.edu/~west1/death/mention_freq_curves/%s/%s.png")',
medium, img_name),
num_active[mid],
num_active_before[mid]/num_finite_before[mid],
num_active_after[mid]/num_finite_after[mid],
max(smoothed[names(smoothed) %in% subrange_before], na.rm=TRUE),
max(raw[names(raw) %in% subrange_before], na.rm=TRUE),
mean_before,
mean(smoothed[names(smoothed) %in% c(subrange_peak, subrange_after)]),
mean(smoothed[names(smoothed) %in% subrange_after]),
peak_smoothed,
peak_raw,
peak_raw_day,
time_till_half(smoothed),
time_till_half(smoothed, 0.25),
if (mid %in% mid_nat) 1 else 0,
if (mid %in% mid_unnat) 1 else 0,
as.character(props[mid, 'gender']),
as.character(props[mid, 'cause_of_death']),
props[mid, 'age'],
as.character(props[mid, 'notable_types'])
)
}
# Write the data into a CSV file.
write.table(stats, file, quote=FALSE, sep='\t', row.names=FALSE)
} else {
stats <- read.table(file, header=TRUE, sep='\t', comment.char='', quote='')
}
return(stats)
}
load_stats <- function(medium, x, X, num_art) {
if (medium != 'NEWS' && medium != 'TWITTER') {
stop('Medium must be \'NEWS\' or \'TWITTER\'')
}
stats <- compute_curve_stats(medium, x, X, num_art)
rownames(stats) <- long_mid(stats$mid)
# If the person has 0 mentions on the day of death, the log will be -Inf.
# Replace those by the smallest finite number.
stats$peak_raw[!is.finite(stats$peak_raw)] <- min(stats$peak_raw[is.finite(stats$peak_raw)])
# Since the quantities are logarithmic (log10), taking the difference is equivalent to the log10
# of the ratio.
stats$peak_mean_boost <- stats$peak_raw - stats$mean_before
stats$peak_max_boost <- stats$peak_raw - stats$max_raw_before
stats$perm_boost <- stats$mean_after_peak - stats$mean_before
stats$perm_boost_all <- stats$mean_after - stats$mean_before
# Some rank transforms.
r <- function(x) {
n <- length(x)
# Keys: indices in x; values: ranks.
rank_map <- 1:n
names(rank_map) <- order(x)
rank_map[as.character(1:n)]
}
stats$mean_before_rank <- r(stats$mean_before)
stats$mean_after_rank <- r(stats$mean_after)
stats$peak_mean_boost_rank <- r(stats$peak_mean_boost)
stats$perm_boost_rank <- r(stats$perm_boost)
stats$perm_boost_all_rank <- r(stats$perm_boost_all)
stats$time_till_half_rank <- r(stats$time_till_half)
# Add a categorical death-type column.
stats$death_type <- 'unknown'
stats$death_type[stats$natural_death==1] <- 'natural'
stats$death_type[stats$unnatural_death==1] <- 'unnatural'
stats$death_type <- as.factor(stats$death_type)
# Add age groups.
stats$age_group <- floor(stats$age/10)*10
# Add anglo flag.
country_known <- props$mid[props$nationality!='']
anglos <- props$mid[grepl(ANGLO_COUNTRY_REGEX, props$nationality)]
stats$anglo <- 'unknown'
stats$anglo[rownames(stats) %in% country_known] <- 'non_anglo'
stats$anglo[rownames(stats) %in% anglos] <- 'anglo'
stats$anglo <- as.factor(stats$anglo)
# Notable types for all people.
types <- tax_types[strip_plaintext_name_from_mid(props$notable_types), 'level1']
names(types) <- props$mid
stats$type_group <- types[rownames(stats)]
return(stats)
}
# The stats of all people, rather than just the couple of thousands included in our study.
load_stats_all <- function() {
stats_all <- props[props$mid %in% died_in_window, c('gender', 'age', 'cause_of_death', 'nationality', 'notable_types')]
stats_all$death_type <- 'unknown'
stats_all$death_type[rownames(stats_all) %in% names(which(get_cause_of_death_map()==1))] <- 'natural'
stats_all$death_type[rownames(stats_all) %in% names(which(get_cause_of_death_map()==-1))] <- 'unnatural'
stats_all$death_type <- as.factor(stats_all$death_type)
country_known <- rownames(stats_all)[stats_all$nationality!='']
anglos <- rownames(stats_all)[grepl(ANGLO_COUNTRY_REGEX, stats_all$nationality)]
stats_all$anglo <- 'unknown'
stats_all$anglo[rownames(stats_all) %in% country_known] <- 'non_anglo'
stats_all$anglo[rownames(stats_all) %in% anglos] <- 'anglo'
stats_all$anglo <- as.factor(stats_all$anglo)
types <- tax_types[strip_plaintext_name_from_mid(stats_all$notable_types), 'level1']
names(types) <- rownames(stats_all)
stats_all$type_group <- types[rownames(stats_all)]
return(stats_all)
}
make_bio_table <- function() {
to_percentage <- function(x) sprintf('%.0f%%', x*100)
summary_table_all <- NULL
summary_table_filtered <- NULL
summary_table_lm <- NULL
### Age
summary_table_filtered['Age'] <- ''
summary_table_all['Age'] <- ''
summary_table_lm['Age'] <- ''
summary_table_filtered['Age N/A'] <- to_percentage(mean(is.na(stats_N$age)))
summary_table_filtered[c('1st quartile', 'Mean', 'Median', '3rd quartile')] <- sprintf('%.0f', summary(stats_N$age)[c(2,4,3,5)])
summary_table_all['Age N/A'] <- to_percentage(mean(is.na(stats_all$age)))
summary_table_all[c('1st quartile', 'Mean', 'Median', '3rd quartile')] <- sprintf('%.0f', summary(stats_all$age[stats_all$age <= 120])[c(2,4,3,5)])
summary_table_lm['Age N/A'] <- to_percentage(mean(is.na(lmdata_N$age)))
summary_table_lm[c('1st quartile', 'Mean', 'Median', '3rd quartile')] <- sprintf('%.0f', summary(lmdata_N$age)[c(2,4,3,5)])
### Gender
summary_table_filtered['Gender'] <- ''
summary_table_all['Gender'] <- ''
summary_table_lm['Gender'] <- ''
idx <- stats_N$gender %in% c('Female', 'Male')
summary_table_filtered['Gender N/A'] <- to_percentage(mean(!idx))
summary_table_filtered[c('Female', 'Male')] <- to_percentage((summary(stats_N$gender[idx]) / sum(summary(stats_N$gender[idx])))[c('Female', 'Male')])
idx <- stats_all$gender %in% c('Female', 'Male')
summary_table_all['Gender N/A'] <- to_percentage(mean(!idx))
summary_table_all[c('Female', 'Male')] <- to_percentage((summary(stats_all$gender[idx]) / sum(summary(stats_all$gender[idx])))[c('Female', 'Male')])
idx <- lmdata_N$gender %in% c('Female', 'Male')
summary_table_lm['Gender N/A'] <- to_percentage(mean(!idx))
summary_table_lm[c('Female', 'Male')] <- to_percentage((summary(lmdata_N$gender[idx]) / sum(summary(lmdata_N$gender[idx])))[c('Female', 'Male')])
### Manner of death
summary_table_filtered['Manner of death'] <- ''
summary_table_all['Manner of death'] <- ''
summary_table_lm['Manner of death'] <- ''
idx <- stats_N$death_type != 'unknown'
summary_table_filtered['Manner of death N/A'] <- to_percentage(mean(!idx))
summary_table_filtered[c('Natural', 'Unnatural')] <-
to_percentage((summary(stats_N$death_type[idx]) / sum(summary(stats_N$death_type[idx])))[c('natural', 'unnatural')])
idx <- stats_all$death_type != 'unknown'
summary_table_all['Manner of death N/A'] <- to_percentage(mean(!idx))
summary_table_all[c('Natural', 'Unnatural')] <-
to_percentage((summary(stats_all$death_type[idx]) / sum(summary(stats_all$death_type[idx])))[c('natural', 'unnatural')])
idx <- lmdata_N$death_type != 'unknown'
summary_table_lm['Manner of death N/A'] <- to_percentage(mean(!idx))
summary_table_lm[c('Natural', 'Unnatural')] <-
to_percentage((summary(lmdata_N$death_type[idx]) / sum(summary(lmdata_N$death_type[idx])))[c('natural', 'unnatural')])
### Language
summary_table_filtered['Language'] <- ''
summary_table_all['Language'] <- ''
summary_table_lm['Language'] <- ''
idx <- stats_N$anglo != 'unknown'
summary_table_filtered['Language N/A'] <- to_percentage(mean(!idx))
summary_table_filtered[c('Anglophone', 'Non-anglophone')] <- to_percentage((summary(stats_N$anglo[idx]) / sum(summary(stats_N$anglo[idx])))[c('anglo', 'non_anglo')])
idx <- stats_all$anglo != 'unknown'
summary_table_all['Language N/A'] <- to_percentage(mean(!idx))
summary_table_all[c('Anglophone', 'Non-anglophone')] <- to_percentage((summary(stats_all$anglo[idx]) / sum(summary(stats_all$anglo[idx])))[c('anglo', 'non_anglo')])
idx <- lmdata_N$anglo != 'unknown'
summary_table_lm['Language N/A'] <- to_percentage(mean(!idx))
summary_table_lm[c('Anglophone', 'Non-anglophone')] <- to_percentage((summary(lmdata_N$anglo[idx]) / sum(summary(lmdata_N$anglo[idx])))[c('anglo', 'non_anglo')])
### Notable type
summary_table_filtered['Notability type'] <- ''
summary_table_all['Notability type'] <- ''
summary_table_lm['Notability type'] <- ''
idx <- !is.na(stats_N$type_group)
summ <- summary(stats_N$type_group[idx])[type_groups]
summary_table_filtered['Notability type N/A'] <- to_percentage(mean(!idx))
summary_table_filtered[names(summ)] <- to_percentage(summ / sum(summ))
idx <- !is.na(stats_all$type_group)
summ <- summary(stats_all$type_group[idx])[type_groups]
summary_table_all['Notability type N/A'] <- to_percentage(mean(!idx))
summary_table_all[names(summ)] <- to_percentage(summ / sum(summ))
idx <- !is.na(lmdata_N$type_group)
summ <- summary(lmdata_N$type_group[idx])[type_groups]
summary_table_lm['Notability type N/A'] <- to_percentage(mean(!idx))
summary_table_lm[names(summ)] <- to_percentage(summ / sum(summ))
summary_table_filtered['Count'] <- nrow(stats_N)
summary_table_all['Count'] <- nrow(stats_all)
summary_table_lm['Count'] <- nrow(lmdata_N)
# Return a table with stats for both "all" and "filtered" (i.e., those included in the study).
data.frame(All=summary_table_all, Included=summary_table_filtered, Regression=summary_table_lm)
}
plot_num_art <- function(num_art, medium) {
par(mar=c(6,5,2,2))
idx <- which(names(num_art)==TWITTER_START_DATE):which(names(num_art)==MAX_DATE)
plot(num_art[idx], axes=FALSE, log='y', type='p', xlab='', ylab='Number of documents',
col=COL[[medium]], main=medium, ylim=c(5e4,1e8))
axis(2)
days <- DAYS[idx]
ticks <- which(grepl('-01$', days))
axis(1, at=ticks, labels=rep('', length(ticks)), col.ticks='gray')
ticks <- which(grepl('-01-01$', days))
axis(1, at=ticks, labels=days[ticks], las=2)
# par(.default_par)
}
plot_time_series_for_small_multiples <- function(name, medium, X, x, num_art,
xlab=TRUE, ylab=TRUE) {
if (SAVE_PLOTS) pdf(sprintf('%s/mention_curve_%s_%s.pdf', PLOTDIR, name, medium), width=2, height=2,
pointsize=8, family='Helvetica', useDingbats=FALSE)
par(mar=c(3,3,1,1))
mid <- wiki_to_mid[sprintf('<http://en.wikipedia.org/wiki/%s>', name)]
ymin <- log10(1 / max(num_art, na.rm=TRUE))
raw <- log10(x[mid, colnames(x) %in% -100:360])
smoothed <- X[mid, colnames(X) %in% -100:360]
plot(-100:360, raw, col=COL_LIGHT[[medium]],
bty='n', xlab='', xlim=c(-100,360), ylim=c(ymin,-1), ylab='',
panel.first=abline(v=0, col='gray'))
text(360, -1, sprintf('%s', gsub('_', ' ', name)), adj=c(1,1), cex=1)
lines(names(smoothed), smoothed, col=COL[[medium]], lwd=2)
if (xlab) mtext('Days since death', side=1, line=2)
if (ylab) mtext('Fraction mentioning docs [log10]', side=2, line=2)
if (SAVE_PLOTS) dev.off()
}
plot_max_mem_hist <- function(medium) {
if (SAVE_PLOTS) cairo_pdf(sprintf('%s/max_mem_hist_%s.pdf', PLOTDIR, medium), width=3.4, height=2.4,
pointsize=8, family='Helvetica')
par(mar=c(3,3,1,3))
if (medium == 'NEWS') {
x <- x_N
} else if (medium == 'TWITTER') {
x <- x_T
} else {
stop("medium must be NEWS or TWITTER!")
}
t_max <- 100
max_days <- apply(x[,colnames(x) %in% 0:t_max], 1, function(x) which.max(x) - 1)
h <- hist(max_days, breaks=0:(t_max+1), right=FALSE, include.lowest=FALSE, xlab='', ylab='',
main=NULL, axes=FALSE, col=COL[[medium]], border=NA, xlim=c(0,t_max+1), ylim=c(0,800))
cum <- cumsum(h$counts)/sum(h$counts)
ticks <- seq(0, t_max, 10)
axis(1, at=ticks+0.5, labels=ticks, las=2)
mtext('Days since death', side=1, line=2)
axis(2, col=COL[[medium]], col.axis=COL[[medium]])
mtext('Number of people', side=2, line=2, col=COL[[medium]])
par(new=TRUE)
plot((0:t_max)+0.5, cum, type='p', xlab='', ylab='', xlim=c(0,t_max+1), ylim=c(0,1), axes=FALSE,
panel.first=abline(h=cum[30], v=29.5, col='gray', lty=2))
axis(4)
mtext('Cumulative rel. frequency', side=4, line=2)
text(29.5, cum[30], labels=sprintf('%.1f%% had maximum mention\n frequency by day 29', 100*cum[30]),
adj=c(-0.1,1.5), col='gray')
if (SAVE_PLOTS) dev.off()
return(list(hist=h, max_days=max_days))
}
make_df_for_fit <- function(medium) {
if (medium == 'NEWS') {
x <- x_N
} else if (medium == 'TWITTER') {
x <- x_T
} else {
stop("medium must be NEWS or TWITTER!")
}
t <- 1:400
eps <- min(x[x>0], na.rm=TRUE)
y <- 10^colMeans(log10(x[,colnames(x) %in% t] + eps), na.rm=TRUE)
df <- data.frame(y=y, t=t)
return(df)
}
biexp_fit <- function(medium) {
df <- make_df_for_fit(medium)
# First approximation: r = 0.
# log(y) ~ log(N*exp(-p*t)) = log(N)-p*t
model0 <- lm(log(y) ~ t, data=df)
N0 <- exp(coef(model0)[1])
p0 <- -coef(model0)[2]
# Second approximation: q = 0.
model1 <- nls(log(y) ~ log( N/(p+r) * (p*exp(-(p+r)*t) + r) ),
start=list(p=p0, N=N0, r=0),
data=df, control=list(maxiter=1e8), algorithm='port')
N1 <- coef(model1)['N']
p1 <- coef(model1)['p']
r1 <- coef(model1)['r']
# Full model.
model <- nls(log(y) ~ log( N/(p-q+r) * ((p-q)*exp(-(p+r)*t) + r*exp(-q*t)) ),
start=list(p=p1, N=N1, r=r1, q=0),
data=df, control=list(maxiter=1e8), algorithm='port', lower=c(q=0))
return(model)
}
exp_fit <- function(medium) {
df <- make_df_for_fit(medium)
# log(y) ~ log(a*exp(-b*t)) = log(a)-b*t
model <- lm(log(y) ~ t, data=df)
return(model)
}
exp_power_fit <- function(medium) {
df <- make_df_for_fit(medium)
# Start with a good guess.
# log(y) ~ log(a*exp(b*t^(-1))) = log(a)+b*t^(-1)
c0 <- -1
model0 <- lm(log(y) ~ I(t^c0), data=df)
log_a0 <- coef(model0)[1]
b0 <- coef(model0)[2]
model <- nls(log(y) ~ log_a + b*t^c,
start=list(log_a=log_a0, b=b0, c=c0),
data=df, control=list(maxiter=1e8), algorithm='port')
return(model)
}
lognorm_fit <- function(medium) {
df <- make_df_for_fit(medium)
# log(y) ~ a + b*log(t) + c*log(t)^2
model <- lm(log(y) ~ log(t) + I(log(t)^2), data=df)
return(model)
}
lognorm_constrained_fit <- function(medium) {
df <- make_df_for_fit(medium)
df$combo <- log(df$t)^2 - 2*log(max(df$t))*log(df$t)
# Requiring the vertex to be at the maximum time value introduces the constraint
# b = -2c*log(t_max), which reduces the problem to the following:
# log(y) ~ a + c*(log(t)^2 - 2*log(t_max)*log(t))
model <- lm(log(y) ~ combo, data=df)
return(model)
}
power_fit <- function(medium) {
df <- make_df_for_fit(medium)
# log(y) ~ a + b*log(t)
model <- lm(log(y) ~ log(t), data=df)
return(model)
}
log_fit <- function(medium) {
df <- make_df_for_fit(medium)
# y ~ a + b*log(t)
model0 <- lm(y ~ log(t), data=df)
a0 <- coef(model0)[1]
b0 <- coef(model0)[2]
# Necessary to avoid negative argument to log in nls fit (for TWITTER only).
if (a0 + b0*log(max(df$t)) <= 0) a0 <- -b0*log(max(df$t)) + 1e-8
# log(y) ~ log(a + b*log(t))
model <- nls(log(y) ~ log(a + b*log(t)),
# Without "/2", we fail to find an optimum in the case of TWITTER.
start=list(a=a0/2, b=b0/2),
data=df, control=list(maxiter=1e8))
return(model)
}
hyperbolic_fit <- function(medium) {
df <- make_df_for_fit(medium)
# 1/y ~ a + b*t
model0 <- lm(1/y ~ t, data=df)
a0 <- coef(model0)[1]
b0 <- coef(model0)[2]
# log(y) ~ -log(a + b*t)
model <- nls(log(y) ~ -log(a + b*t),
start=list(a=a0, b=b0),
data=df, control=list(maxiter=1e8), algorithm='port')
return(model)
}
hyperbolic_power_fit <- function(medium) {
df <- make_df_for_fit(medium)
# Start with a good guess.
# 1/y ~ a + b*t^c
c0 <- -7
model0 <- lm(1/y ~ I(t^c0), data=df)
a0 <- coef(model0)[1]
b0 <- coef(model0)[2]
# log(y) ~ -log(a + b*t^c)
model <- nls(log(y) ~ -log(a + b*t^c),
start=list(a=a0, b=b0, c=c0),
data=df, control=list(maxiter=1e9, warnOnly=TRUE), algorithm='port', trace=FALSE,
lower=c(a=0, b=-Inf, c=-Inf), upper=c(a=Inf, b=0, c=0))
return(model)
}
plot_avg_fraction_of_mentioning_docs <- function(medium, biexp_model=NULL) {
if (SAVE_PLOTS) pdf(sprintf('%s/avg_mention_curve_%s.pdf', PLOTDIR, medium), width=3.4, height=3.4,
pointsize=8, family='Helvetica', useDingbats=FALSE)
par(mar=c(3,3,1,1))
if (medium == 'NEWS') {
x <- x_N
} else if (medium == 'TWITTER') {
x <- x_T
} else {
stop("medium must be NEWS or TWITTER!")
}
max_day <- 400
days <- -100:max_day
eps <- min(x[x>0], na.rm=TRUE)
# Log.
y <- colMeans(log10(x[,colnames(x) %in% days] + eps), na.rm=TRUE)
par(fig=c(0, 1, 0, 1))
plot(days, y, type='p', bty='n', lwd=2, xlab='', ylab='', #xaxt='n', yaxt='n',
panel.first=abline(v=0, col='gray', lwd=1, lty=2), col=COL[[medium]], las=0)
mtext('Days since death', side=1, line=2)
mtext('Fraction mentioning documents [log10]', side=2, line=2)
y365 <- y['365']
arrows(x0=325, x1=355, y0=y365, y1=y365, length=0.05)
text(325, y365, '365 days', adj=c(1.1, 0.5))
if (!is.null(biexp_model)) {
biexp_coefs <- coef(biexp_model)
N <- biexp_coefs['N']
p <- biexp_coefs['p']
r <- biexp_coefs['r']
q <- biexp_coefs['q']
t <- 1:max_day
mem_comm <- N * exp(-(p+r)*t)
mem_cult <- N/(p-q+r)*r * (exp(-q*t) - exp(-(p+r)*t))
mem <- mem_comm + mem_cult
lines(t, log10(mem_comm), lty=1, lwd=2, col=COL_GREEN)
lines(t, log10(mem_cult), lty=1, lwd=2, col=COL_RED)
lines(t, log10(mem), lty=1, lwd=2, col='black')
legend('topright', bty='n',
legend=c('Daily average', 'Biexponential fit S(t) = u(t) + v(t)',
'Communicative memory u(t)', 'Cultural memory v(t)'),
seg.len=3,
pch=c(1,NA,NA,NA),
lty=c(0,1,1,1), #lty=c(0,1,4,2),
col=c(COL[[medium]], 'black', COL_GREEN, COL_RED),
lwd=c(2, 2, 2, 2))
# Log-log.
par(fig=c(0.35, 0.9, 0.27, 0.77), new=TRUE)
y <- y[names(y) %in% t]
# Here the day of death is shown as day 1 (due to log axes).
plot(t, y, type='p', bty='n', xlab='', ylab='', lwd=2, col=COL[[medium]], las=0, log='x')
lines(t, log10(mem_comm), lty=1, lwd=2, col=COL_GREEN)
lines(t, log10(mem_cult), lty=1, lwd=2, col=COL_RED)
lines(t, log10(mem), lty=1, lwd=2, col='black')
legend('topright', bty='n', legend='Log x-scale', inset=c(0.1,0.2))
}
if (SAVE_PLOTS) dev.off()
}
plot_model_fit <- function(medium, name, model, predict, log='') {
if (medium == 'NEWS') {
x <- x_N
} else if (medium == 'TWITTER') {
x <- x_T
} else {
stop("medium must be NEWS or TWITTER!")
}
df <- make_df_for_fit(medium)
t <- df$t
y <- log10(df$y)
yhat <- log10(predict)
par(mar=c(3,3,1,1))
plot(t, y, type='p', bty='n', lwd=2, xlab='', ylab='', col=COL[[medium]], las=0, log=log)
mtext('Days since death', side=1, line=2, cex=0.6)
mtext('Mention frequency [log10]', side=2, line=2, cex=0.6)
lines(t, yhat, lwd=2)
aic <- AIC(model)
r2 <- cor(yhat, y)^2
if (log=='x') text(400, max(y),
sprintf('R² = %.3f\nAIC = %.0f', r2, aic),
adj=c(1,1), cex=1)
else text(200, max(y), name, adj=c(0.5,1), cex=1.4)
}
plot_all_model_fits <- function(medium) {
models <- list(
"Exponential"=exp_fit(medium),
"Hyperbolic"=hyperbolic_fit(medium),
"Logarithmic"=log_fit(medium),
"Power"=power_fit(medium),
"Hyperbolic-power"=hyperbolic_power_fit(medium),
"Log-normal-inspired"=lognorm_constrained_fit(medium),
"Exponential-power"=exp_power_fit(medium),
"Biexponential"=biexp_fit(medium)
)
if (SAVE_PLOTS) pdf(sprintf('%s/all_model_fits_%s.pdf', PLOTDIR, medium),
width=3.5, height=9, pointsize=8, family='Helvetica', useDingbats=FALSE)
par(mfcol=c(8,2))
for (log in c('', 'x')) {
for (name in names(models)) {
model <- models[[name]]
plot_model_fit(medium, name, model, exp(predict(model)), log=log)
}
}
if (SAVE_PLOTS) dev.off()
}
plot_comm_vs_cult_mem <- function(medium, biexp_coefs=NULL) {
if (SAVE_PLOTS) pdf(sprintf('%s/comm_vs_cult_mem_%s.pdf', PLOTDIR, medium), width=3.4, height=2.4,
pointsize=8, family='Helvetica', useDingbats=FALSE)
par(mar=c(4,4,1,1))
if (medium == 'NEWS') {
x <- x_N
} else if (medium == 'TWITTER') {
x <- x_T
} else {
stop("medium must be NEWS or TWITTER!")
}
max_day <- 40
t <- seq(0, max_day-1, 0.01)
N <- biexp_coefs['N']
p <- biexp_coefs['p']
r <- biexp_coefs['r']
q <- biexp_coefs['q']
u <- function(t, N, p, r, q) N * exp(-(p+r)*t)
v <- function(t, N, p, r, q) N/(p-q+r)*r * (exp(-q*t) - exp(-(p+r)*t))
mem_comm <- u(t, N, p, r, q)
mem_cult <- v(t, N, p, r, q)
ratio <- mem_cult / (mem_comm + mem_cult)
plot(t+1, ratio, type='l', bty='n', col=COL[[medium]], lwd=2,
xlab='Days since death', ylab='Fraction of cultural memory')
for (perc in c(0.5, 0.99)) {
idx <- min(which(ratio>=perc))
points(t[idx]+1, ratio[idx], lwd=3, pch=20, col='black', cex=2)
segments(-5, ratio[idx], t[idx]+1, ratio[idx], col='black', lty=2)
segments(t[idx]+1, -5, t[idx]+1, ratio[idx], col='black', lty=2)
text(t[idx]+1, ratio[idx], adj=c(-0.2, 1.2),
labels=sprintf('%.0f%% after\n%.0f days', 100*perc, ceiling(t[idx]+1)))
}
if (SAVE_PLOTS) dev.off()
}
overlay_time_series <- function(X, medium) {
matplot(-N:N, t(X), type='l', lty=1, lwd=2, col=COL_XLIGHT[[medium]], xlab='Days since death',
ylab='Fraction mentioning docs [log10]', ylim=range(X), main=medium, bty='n')
}
smoothed_density <- function(x, bw=0.05, range=c(0,1)) {
# Reflect the data to avoid drops at boundaries.
left <- -(x-range[1]) + range[1]
right <- left + 2*(range[2]-range[1])
x <- c(left, x, right)
est <- bkde(x, bandwidth=bw, range=range(x), gridsize=1000)
# Consider only the original range.
idx <- which(est$x >= range[1] & est$x <= range[2])
est$x <- est$x[idx]
# Renormalize.
est$y <- 3*est$y[idx]
est
}
# Smoothed histograms of curve properties for the different people groups.
plot_smoothed_densities_raw <- function(stats, medium, prop, groups, main=NULL) {
nbw <- 20
chars <- c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')
par(mfrow=c(length(groups), length(chars)), mar=c(4,4,2,2))
for (char in chars) {
# Find the extreme y-value to be plotted.
ymin <- Inf
ymax <- -Inf
for (group in groups) {
mask <- is.finite(stats[,prop]) & stats[,prop] %in% group
x <- stats[mask,char]
sm <- smoothed_density(x, bw=(max(x)-min(x))/nbw, range=range(x))$y
ymin <- min(ymin, min(sm))
ymax <- max(ymax, max(sm))
}
for (group in groups) {
mask <- is.finite(stats[,prop]) & stats[,prop] %in% group
x <- stats[mask,char]
est <- smoothed_density(x, bw=(max(x)-min(x))/nbw, range=range(x))
# The estimate without reflected data, that is it drops at the boundaries.
est_ <- bkde(x, bandwidth=(max(x)-min(x))/nbw, range=range(x))
plot(est$x, est$y, ylim=c(ymin,ymax), type='l', lwd=2,
main=if (is.null(main)) group else main,
col=COL[[medium]], xlab=FANCY_CURVE_CHAR_NAMES[char], ylab='Density', bty='n')
rug(x)
}
}
par(.default_par)
}
plot_chars_by_group <- function(stats, medium, prop, groups) {
chars <- c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')
par(mfrow=c(1, length(chars)), mar=c(10,4,2,2))
for (char in chars) {
char_rank <- sprintf('%s_rank', char)
grouped <- split(stats[,char_rank]/nrow(stats), prop)[as.character(groups)]
boxplot(grouped, las=3, notch=TRUE, main=FANCY_CURVE_CHAR_NAMES[char],
col=COL_LIGHT[[medium]], bty='n', ylab='Relative rank')
}
par(.default_par)
}
smoothed_2d_density <- function(x, bw) {
# Reflect the data to avoid drops at boundaries.
x <- rbind(cbind( -x[,1], -x[,2]),
cbind( -x[,1], x[,2]),
cbind( -x[,1], 2-x[,2]),
cbind( x[,1], -x[,2]),
cbind( x[,1], x[,2]),
cbind( x[,1], 2-x[,2]),
cbind(2-x[,1], -x[,2]),
cbind(2-x[,1], x[,2]),
cbind(2-x[,1], 2-x[,2]))
est <- bkde2D(x, range.x=list(c(-1,2), c(-1,2)), bandwidth=c(bw,bw), gridsize=c(153,153))
# Consider only the original range [0,1].
idx <- which(est$x1 >= 0 & est$x1 <= 1)
est$x1 <- est$x1[idx]
est$x2 <- est$x2[idx]
# Renormalize.
est$fhat <- 9*est$fhat[idx,idx]
est
}
plot_smoothed_2d_densities <- function(stats, medium, char1, char2, prop, groups, main=NULL) {
bw <- 0.1
char1_rank <- sprintf('%s_rank', char1)
char2_rank <- sprintf('%s_rank', char2)
# Find the maximum y-value to be plotted.
ymax <- -Inf
for (group in groups) {
mask <- is.finite(stats[,prop]) & stats[,prop] %in% group
x <- cbind(stats[mask, char1_rank], stats[mask, char2_rank])/length(mask)
ymax <- max(ymax, max(smoothed_2d_density(x, bw=bw)$fhat))
}
for (group in groups) {
mask <- is.finite(stats[,prop]) & stats[,prop] %in% group
x <- cbind(stats[mask, char1_rank], stats[mask, char2_rank])/length(mask)
est <- smoothed_2d_density(x, bw=bw)
filled.contour(est$x1, est$x2, est$fhat, zlim=c(0,ymax),
color=function(n) gray.colors(n,0,1),
plot.title=title(main=sprintf('%s\n(%s)', if (is.null(main)) group else main, medium),
xlab=char1, ylab=char2))
par(.default_par)
}
}
cluster <- function(stats, feats) {
input <- cbind(stats[,feats])
colnames(input) <- feats
# z-score normalize.
input <- t(t(input) - colMeans(input))
input <- t(t(input) / apply(input, 2, sd))
K <- 2:30
kmeans_all <- mclapply(K, function(k) { set.seed(1); kmeans(input, k, nstart=20) })
# Reorder the clusters.
for (k in K) {
km <- kmeans_all[[which(K==k)]]
ord <- order(km$size, decreasing=TRUE)
ord_map <- 1:k; names(ord_map) <- ord
km$size <- km$size[ord]
km$withinss <- km$withinss[ord]
km$centers <- km$centers[ord,]
rownames(km$centers) <- 1:k
km$cluster <- ord_map[as.character(km$cluster)]
kmeans_all[[which(K==k)]] <- km
}
list(clustering=kmeans_all, diss=dist(input), K=K)
}
plot_sil_width <- function(medium, kmeans_all, diss) {
avg_sil_width_kmeans <- unlist(lapply(kmeans_all, function(km) summary(silhouette(km$cluster, diss^2))$avg.width))
plot(2:(length(kmeans_all)+1), avg_sil_width_kmeans, type='b', xlab='Number of clusters',
ylab='Avg. silhouette width', main=medium, col=COL[[medium]], lwd=2, bty='n')
}
plot_clustered_time_series <- function(medium, kmeans, X) {
k <- nrow(kmeans$centers)
par(mfrow=c(1, k))
for (j in 1:k) {
matplot(-N:N, t(X[kmeans$cluster==j,]), type='l', lty=1, lwd=2, col=COL_XLIGHT[[medium]],
xlab='Days since death', ylab='% mentioning docs [log10]', ylim=range(X),
main=sprintf('%s cluster %s', medium, j))
}
par(.default_par)
}
boot_ci <- function(x, func, R=5000) {
bo <- boot(x, statistic=function(d, i) func(d[i]), R=R)
ci <- boot.ci(bo, conf=0.95, type="basic")$basic[4:5]
if (is.null(ci)) {
upper <- lower <- NA
} else {
lower <- ci[1]
upper <- ci[2]
}
unlist(list(upper=upper, point_est=func(x), lower=lower))
}
make_regression_data <- function(medium) {
if (medium != 'NEWS' && medium != 'TWITTER') stop('Medium must be \'NEWS\' or \'TWITTER\'')
stats <- if (medium=='NEWS') stats_N else stats_T
data <- stats[stats$gender != ''
& stats$death_type != 'unknown'
& !is.na(stats$type_group) # This removes only 4 people.
& !is.na(stats$age_group)
& stats$age %in% 20:99,]
data$age_group <- factor(data$age_group)
# Mean (median) age is 71 (74), so use the age bracket 70-79 as the reference level.
base_age_group <- '70'
base_death_type <- 'unknown'
base_death_type <- 'natural'
base_type_group <- 'art' # 'general fame'
base_gender <- 'Male'
base_anglo <- 'anglo'
data$age_group <- relevel(data$age_group, base_age_group)
data$death_type <- relevel(data$death_type, base_death_type)
data$type_group <- relevel(data$type_group, base_type_group)
data$gender <- relevel(data$gender, base_gender)
data$anglo <- relevel(data$anglo, base_anglo)
N <- dim(data)[1]
relranks <- (0:(N-1))/(N-1) - 0.5
data$mean_before_relrank[order(data$mean_before_rank)] <- relranks
data$peak_mean_boost_relrank[order(data$peak_mean_boost_rank)] <- relranks
data$perm_boost_relrank[order(data$perm_boost_rank)] <- relranks
return(data)
}
make_regression_data_T_vs_N <- function() {
lmdata <- make_regression_data('NEWS')
lmdata$mean_before_relrank_diff <- lmdata_N$mean_before_relrank - lmdata_T$mean_before_relrank
lmdata$peak_mean_boost_diff <- lmdata_N$peak_mean_boost - lmdata_T$peak_mean_boost
lmdata$perm_boost_diff <- lmdata_N$perm_boost - lmdata_T$perm_boost
lmdata$peak_mean_boost_relrank_diff <- lmdata_N$peak_mean_boost_relrank - lmdata_T$peak_mean_boost_relrank
lmdata$perm_boost_relrank_diff <- lmdata_N$perm_boost_relrank - lmdata_T$perm_boost_relrank
return(lmdata)
}
# The real uncertainty in the summed coefficients is larger; need to add separate SEs and entries of vcov
# matrix: https://biologyforfun.wordpress.com/2017/05/17/adding-standard-errors-for-interaction-terms/comment-page-1/
compound_se_for_lm <- function(mod, interact=NULL, base_age=NULL) {
X <- as.data.frame(vcov(mod))
idx <- rownames(X)[grep('age_group..$', rownames(X))]
X[sprintf('age_group%s:%s', base_age, interact),] <- 0
X[,sprintf('age_group%s:%s', base_age, interact)] <- 0
sapply(idx, function(i) {
sub_idx <- c(i, if (is.null(interact)) NULL else c(interact, sprintf('%s:%s', i, interact)))
sqrt(sum(X[sub_idx, sub_idx]))
})
}
run_regression_analysis <- function(medium, ranks) {
if (medium != 'NEWS' && medium != 'TWITTER') stop('Medium must be \'NEWS\' or \'TWITTER\'')
lmdata <- if (medium == 'NEWS') lmdata_N else lmdata_T
predictors <- 'mean_before_relrank + age_group + death_type + gender + anglo + type_group'
suffix <- if (ranks) '_relrank' else ''
mod_peak <- lm(as.formula(sprintf('peak_mean_boost%s ~ %s', suffix, predictors)), lmdata)
mod_perm <- lm(as.formula(sprintf( 'perm_boost%s ~ %s', suffix, predictors)), lmdata)
return(list(mod_peak=mod_peak, mod_perm=mod_perm))
}
run_regression_analysis_for_death_types <- function(medium, ranks) {
if (medium != 'NEWS' && medium != 'TWITTER') stop('Medium must be \'NEWS\' or \'TWITTER\'')
lmdata <- if (medium == 'NEWS') lmdata_N else lmdata_T
base_death_type <- levels(lmdata$death_type)[1]
# Compare natural vs. unnatural death for the different age brackets (0 to omit the
# intercept and include it instead as the coefficient of the base age).
predictors <- '0 + mean_before_relrank + age_group + death_type + gender + anglo + type_group + age_group:death_type'
interact <- sprintf('death_type%s', if (base_death_type=='natural') 'unnatural' else 'natural')
suffix <- if (ranks) '_relrank' else ''
mod_peak <- lm(as.formula(sprintf('peak_mean_boost%s ~ %s', suffix, predictors)), lmdata)
mod_perm <- lm(as.formula(sprintf( 'perm_boost%s ~ %s', suffix, predictors)), lmdata)
return(list(mod_peak=mod_peak, mod_perm=mod_perm))
}
run_regression_analysis_T_vs_N <- function(ranks) {
lmdata <- make_regression_data_T_vs_N()
predictors <- 'mean_before_relrank_diff + age_group + death_type + gender + anglo + type_group'
suffix <- if (ranks) '_relrank' else ''
mod_peak <- lm(as.formula(sprintf('peak_mean_boost%s_diff ~ %s', suffix, predictors)), lmdata)
mod_perm <- lm(as.formula(sprintf('perm_boost%s_diff ~ %s', suffix, predictors)), lmdata)
return(list(mod_peak=mod_peak, mod_perm=mod_perm))
}
run_regression_analysis_T_vs_N_for_death_types <- function(ranks) {
lmdata <- make_regression_data_T_vs_N()
predictors <- '0 + mean_before_relrank_diff + age_group + death_type + gender + anglo + type_group + age_group:death_type'
suffix <- if (ranks) '_relrank' else ''
mod_peak <- lm(as.formula(sprintf('peak_mean_boost%s_diff ~ %s', suffix, predictors)), lmdata)
mod_perm <- lm(as.formula(sprintf('perm_boost%s_diff ~ %s', suffix, predictors)), lmdata)
return(list(mod_peak=mod_peak, mod_perm=mod_perm))
}
plot_age_coeffs_for_death_types <- function(mod, medium, outcome, base_age, ranks,
ylim=NULL, col=COL[[medium]]) {
summary <- coef(summary(mod))
# Non-interaction terms (i.e., natural death).
idx <- grep('^age_group..$', rownames(summary))
decades <- as.numeric(sub('.*(\\d\\d).*', '\\1', rownames(summary))[idx])
ord <- order(decades)
decades <- decades[ord]
age_coef_natural <- summary[idx,1][ord]
se_natural <- compound_se_for_lm(mod)[ord]
# Interaction terms (i.e., unnatural death).
idx <- grep('^age_group..:death_typeunnatural$', rownames(summary))
summary[idx,1] <- summary[idx,1] + summary['death_typeunnatural',1]
rownames(summary)[rownames(summary)=='death_typeunnatural'] <- sprintf('age_group%s:death_typeunnatural', base_age)
# Add age_group baseline coefs to interaction coefs.
idx <- grep('^age_group..:death_typeunnatural$', rownames(summary))
summary[idx,1] <- summary[idx,1] + summary[grep('age_group..$', rownames(summary)), 1]
age_coef_unnatural <- summary[idx,1][ord]
se_unnatural <- compound_se_for_lm(mod, 'death_typeunnatural', base_age)[ord]
if (is.null(ylim)) ylim <- range(c(age_coef_natural - 2*se_natural,
age_coef_natural + 2*se_natural,
age_coef_unnatural - 2*se_unnatural,
age_coef_unnatural + 2*se_unnatural), na.rm=TRUE)
col_unnat <- 'gray'
suffix <- if (ranks) '_relrank' else ''
if (outcome == 'peak_mean_boost') {
main <- sprintf('Short-term boost', medium)
xlab <- 'Average boost'
outfile <- sprintf('%s/age_coefs_%s_%s%s.pdf', PLOTDIR, medium, outcome, suffix)
} else if (outcome == 'perm_boost') {
main <- sprintf('Long-term boost', medium)
xlab <- 'Average boost'
outfile <- sprintf('%s/age_coefs_%s_%s%s.pdf', PLOTDIR, medium, outcome, suffix)
} else if (outcome == 'peak_mean_boost_diff') {
main <- 'Short-term boost'
xlab <- 'Average boost difference'
outfile <- sprintf('%s/age_coefs_NEWS-VS_TWITTER_%s%s.pdf', PLOTDIR, outcome, suffix)
} else if (outcome == 'perm_boost_diff') {
main <- 'Long-term boost'
xlab <- 'Average boost difference'
outfile <- sprintf('%s/age_coefs_NEWS-VS_TWITTER_%s%s.pdf', PLOTDIR, outcome, suffix)
} else {
stop('Illegal outcome name')
}
if (SAVE_PLOTS) cairo_pdf(outfile, width=3.4, height=2, pointsize=8, family='Helvetica')
par(mar=c(3,3,1,1))
plot(decades+6, age_coef_unnatural, type='b', panel.first=abline(h=0, col='gray'),
ylim=ylim, xlim=c(min(decades), max(decades)+10), bty='n', xaxt='n', yaxt='n',
main=main, xlab='', ylab='', lwd=2, col=col_unnat)
dispersion(decades+6, age_coef_unnatural, 2*se_unnatural, col=col_unnat, arrow.cap=0.005)
axis(1, at=c(decades, max(decades)+10), line=0)
axis(2, line=0)
mtext('Age at death', 1, line=2)
mtext(xlab, 2, line=2)
points(decades+4, age_coef_natural, type='b', lwd=2, col=col)
dispersion(decades+4, age_coef_natural, 2*se_natural, col=col, arrow.cap=0.005)
legend('bottomleft', c('Natural death', 'Unnatural death'), col=c(col, col_unnat), lty=1, lwd=2, horiz=TRUE, bty='n')
if (SAVE_PLOTS) dev.off()
}
print_regression_table <- function(mods_N, mods_T, ranks) {
age_names <- sprintf("Age: %d--%d", seq(20,90,10)[-6], seq(29,99,10)[-6])
type_names <- levels(lmdata_N$type_group)[-1]
table <- texreg(list(mods_N$mod_peak, mods_T$mod_peak, mods_N$mod_perm, mods_T$mod_perm), dcolumn=TRUE, booktabs=TRUE,
use.packages=FALSE, digits=3, leading.zero=FALSE, single.row=TRUE, stars = c(0.001,0.01,0.05),
custom.model.names=c("\\shortstack{Short-term boost\\\\(News)}",
"\\shortstack{Short-term boost\\\\(Twitter)}",
"\\shortstack{Long-term boost\\\\(News)}",
"\\shortstack{Long-term boost\\\\(Twitter)}"),
custom.coef.names=c("(Intercept)", "Pre-mortem mean (relative rank)", age_names, "Manner of death: unnatural",
"Gender: female", paste("Language:", c("non-anglophone", "unknown")),
paste("Notability type:", type_names)),
reorder.coef=c(1,2,10,12:13,11,14:18,3:9), table=FALSE)
cat(table, file=sprintf('%s/lm_coef_T_and_N%s.tex', TABLEDIR, if (ranks) '_relrank' else ''), sep="\n")
}
print_regression_table_T_vs_N <- function(mods, ranks) {
age_names <- sprintf("Age: %d--%d", seq(20,90,10)[-6], seq(29,99,10)[-6])
type_names <- levels(lmdata_N$type_group)[-1]
table <- texreg(list(mods$mod_peak, mods$mod_perm), dcolumn=TRUE, booktabs=TRUE,
use.packages=FALSE, digits=3, leading.zero=FALSE, single.row=TRUE, stars = c(0.001,0.01,0.05),
label = "table:coefficients News vs. Twitter", custom.model.names=c("Short-term boost", "Long-term boost"),
custom.coef.names=c("(Intercept)", "Pre-mortem mean (relative-rank diff.)", age_names,
"Manner of death: unnatural",
"Gender: female", paste("Language:", c("non-anglophone", "unknown")),
paste("Notability type:", type_names)),
reorder.coef=c(1,2,10,12:13,11,14:18,3:9), table=FALSE)
cat(table, file=sprintf('%s/lm_coef_T_vs_N%s.tex', TABLEDIR, if (ranks) '_relrank' else ''), sep="\n")
}
print_model_description <- function(medium, depvar, ranks) {
cat('\n\n#####################################################\n')
cat('REGRESSION MODEL\n')
cat(sprintf('Medium: %s\n', medium))
cat(sprintf('Dependent variable: %s\n', depvar))
cat(sprintf('Transformation on dependent variable: %s\n', if (ranks) 'RELATIVE RANKS' else 'NONE'))
cat('#####################################################\n')
}
load_doc_length_regression_data <- function() {
file <- sprintf('%s/RData/log_doc_length_ratios.RData', DATADIR)
if (!file.exists(file)) {
# Note: this uses all non-Twitter documents, not only those from the news domains. Also,
# it uses only a set of 2638 people that were used in an early stage of the project
# (meeting the criterion "min_num_active_days=50"), which are then reduced to the subset
# of 579 people that are also used in the main regression analysis about post-mortem mention
# frequency.
doc_len_all <- read.table(pipe(sprintf('gunzip -c %s/mean_doc_length_per_day.tsv.gz', DATADIR)),
sep='\t', as.is=TRUE, col.names=c('mid', 'date', 'numWords'))
ok_mids <- intersect(unique(doc_len_all$mid), unique(rownames(lmdata_N)))
doc_len <- doc_len_all[doc_len_all$mid %in% ok_mids,]
doc_len$rel_date <- as.numeric(as.Date(doc_len$date) - as.Date(death_dates[doc_len$mid]))
feature <- 'numWords'
x <- do.call(rbind, mclapply(split(doc_len, doc_len$mid), function(l) {
dod <- as.Date(death_dates[l$mid[1]])
offset <- as.numeric(MAX_DATE-MIN_DATE) + 1
y <- rep(NA, NRELDATES)
# Log.
y[l$rel_date + offset] <- log(l[,feature])
y[as.numeric(as.Date(EMPTY_DAYS)-dod) + offset] <- NA
return(y)
}, mc.cores=6))
extreme <- (NRELDATES-1)/2
colnames(x) <- -extreme:extreme
x <- x[, colnames(x) %in% -100:N]
premeans <- rowMeans(x[,colnames(x) <= -30], na.rm=TRUE)
x_norm <- x - premeans
regdata <- cbind(lmdata_N[rownames(x_norm),], x_norm)
save(regdata, file=file)
} else {
load(file)
}
return(regdata)
}
plot_doc_length_increase <- function(regdata, predictors, ylab=NULL) {
coefs <- do.call(rbind,
lapply(-100:N,
function(t) summary(lm(as.formula(sprintf('`%s` ~ %s', t, predictors)),
regdata))$coefficients[1,1:2]))
est <- coefs[,1]
lo <- coefs[,1]-2*coefs[,2]
hi <- coefs[,1]+2*coefs[,2]
# Natural deaths.
plot(-100:N, exp(est)-1, type='l', lwd=2, col='#000000', bty='n', axes=FALSE,
ylim=c(-0.7,0.7),xlab='Days since death', ylab=ylab,
main='Mean relative document length increase', cex.main=1,
panel.first=c(abline(v=0, h=0, col='gray'), #abline(v=c(30), col='red'),
polygon(x=c(-100:N, rev(-100:N)), y=c(lo, rev(hi)), col='#00000040', border=NA),
axis(1, at=seq(-90,360,30), las=2), axis(2)))
# Unnatural deaths.
# predictors <- 'mean_before_relrank + age_group + death_type + anglo'
# coefs <- do.call(rbind,
# lapply(-100:N,
# function(t) sum(summary(lm(as.formula(sprintf('`%s` ~ %s', t, predictors)),
# regdata))$coefficients[c(1,10),1])))
# plot(-100:N, exp(coefs[,1])-1, type='l', lwd=2, col='#000000', bty='n', axes=FALSE, ylim=c(-0.7,0.7),
# panel.first=c(abline(v=0, h=0, col='gray'), axis(1, at=seq(-90,360,30), las=2), axis(2)))
}
draw_mega_figure <- function() {
days <- -100:360
feat_cols <- c(mean_before=COL_DARKBLUE, peak_mean_boost=COL_MAGENTA, perm_boost=COL_GREEN, time_till_half=COL_RED)
bg <- function(col) rect(par("usr")[1], par("usr")[3], par("usr")[2], par("usr")[4], col=col, border=NA)
mar <- 0.2
if (SAVE_PLOTS) cairo_pdf(sprintf('%s/megafigure.pdf', PLOTDIR), width=8, height=8, pointsize=6, family='Helvetica')
par(.default_par)
plot.new()
rel_line_height <- par('mai')[1]/par('mar')[1]/par('din')
par(fig=c(0.59, 0.91, 0.09, 0.41), new=TRUE)
par(mar=c(0,0,0,0))
confusion <- table(as.factor(km_N$cluster), as.factor(km_T$cluster))
palmat <- color.scale(confusion^0.5, cs1=c(1,1), cs2=c(1,0.2), cs3=c(1,0.2))
# Must fix the function definition of color2D.matplot: do fix(color2D.matplot) and replace "f" with "d" on line 101:
# https://stackoverflow.com/questions/44522895/plot-color-matrix-with-numbers-in-r
color2D.matplot(confusion, cellcolors=palmat, main='', show.values=1, vcol=rgb(0,0,0), axes=FALSE, vcex=1.2)
feats <- c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')
# bar_range <- 1.1 * range(c(km_N$centers, km_N$centers))
bar_range <- c(-2.1, 5.2)
for (medium in c('NEWS', 'TWITTER')) {
if (medium == 'NEWS') {
kmeans <- km_N
X <- X_N
x <- x_N
fig <- function(j) c(0.51, 0.59, (4-j)*0.08+0.09, (5-j)*0.08+0.09)
fig_incr <- c(mar*rel_line_height[1], mar*rel_line_height[1], 0, 0)
fig_bar <- function(j) c(0.91, 0.99, (4-j)*0.08+0.09, (5-j)*0.08+0.09)
} else {
kmeans <- km_T
X <- X_T
x <- x_T
fig <- function(j) c(j*0.08+0.51, (j+1)*0.08+0.51, 0.41, 0.49)
fig_incr <- c(0, 0, -mar*rel_line_height[2], -mar*rel_line_height[2])
fig_bar <- function(j) c(j*0.08+0.51, (j+1)*0.08+0.51, 0.01, 0.09)
}
cluster_sizes <- kmeans$size
y <- do.call(cbind, lapply(1:4, function(j) colMeans(X[kmeans$cluster==j, colnames(X) %in% days], na.rm=TRUE)))
for (j in 1:4) {
# bar_range <- 1.1 * range(kmeans$centers)
par(fig=fig(j)-fig_incr, new=TRUE, mar=c(mar, mar, mar, mar))
plot(days, y[,j], type='l', ylim=range(y), las=2, col=COL[[medium]], lwd=2, xlab='', ylab='', xaxt='n', yaxt='n',
bty='n', panel.first=bg("#eeeeee"))
text(max(days), max(y), sprintf('C%d', j), adj=c(1,1), cex=1)
par(fig=fig_bar(j)+fig_incr, new=TRUE, mar=c(mar, mar, mar, mar))
barplot(kmeans$centers[j,], main='', ylab='', col=feat_cols, names.arg='', ylim=bar_range, yaxt='n', border=NA,
bty='n', panel.first=bg("#eeeeee"))
if (medium=='TWITTER' && j==4) {
mtext('z-scores', side=4, line=1.8, cex=0.5)
axis(4, at=seq(-2,5), labels=c('-2','','0','','2','','4',''), cex.axis=0.5, cex.lab=0.5,
lwd=0.8, lwd.ticks=0.8, las=1, hadj=0)
}
text(par("usr")[2]*0.95, par("usr")[4]*0.95, adj=c(1,1), cex=0.5,
sprintf('N=%d\n(%.0f%%)', kmeans$size[j], 100*kmeans$size[j]/sum(kmeans$size)))
}
}
### Matrix row label.
par(fig=c(0.50, 1, 0.50, 0.55), new=TRUE, mar=c(0,0,0,0))
plot(NULL, xaxt='n', yaxt='n', bty='n', ylab='', xlab='', xlim=0:1, ylim=0:1)
text(0.5, 0, 'Twitter', adj=c(0.5,0), col=COL[['TWITTER']], cex=1.2)
### Matrix column label.
par(fig=c(0.40, 0.59, 0.42, 0.50), new=TRUE, mar=c(0,0,0,0))
plot(NULL, xaxt='n', yaxt='n', bty='n', ylab='', xlab='', xlim=0:1, ylim=0:1)
text(1, 0, 'News', adj=c(1,0), col=COL[['NEWS']], cex=1.2)
### Barplot legend.
par(fig=c(0.45, 0.59, 0.01, 0.09), new=TRUE)
plot(NULL, xaxt='n', yaxt='n', bty='n', ylab='', xlab='', xlim=0:1, ylim=0:1)
legend("bottomleft", legend=c(' pre-mortem mean', ' short-term boost', ' long-term boost', ' halving time'),
pch=15, pt.cex=3, cex=0.7, y.intersp=1.7, bty='n', col=feat_cols, text.col=feat_cols)
for (i in 3:1) {
delta <- 0.007
par(fig=c(i*delta, 0.4+i*delta, i*delta, 0.4+i*delta), new=TRUE, mar=c(5,5,0,0))
plot(NULL, xaxt='n', yaxt='n', xlab='', ylab='', panel.first=bg('white'), xlim=0:1, ylim=0:1)
}
par(fig=c(0, 0.50, 0.43, 0.59), new=TRUE, mar=c(0,0,0,0))
plot(NULL, xaxt='n', yaxt='n', bty='n', ylab='', xlab='', xlim=0:1, ylim=0:1)
text(0.5, 0, sprintf('Mention time series (N=%d)', dim(X)[1]), adj=c(0.5,0), cex=0.8)
par(fig=c(0, 0.40, 0, 0.40), new=TRUE, mar=c(5,5,0,0))
name <- 'Whitney_Houston'
# name <- 'Donna_Summer'
# name <- 'Amy_Winehouse'
medium <- 'NEWS'
if (medium == 'NEWS') {
X <- X_N
x <- x_N
num_art <- num_art_N
} else {
X <- X_T
x <- x_T
num_art <- num_art_T
}
cex <- 0.65 # 0.85
mid <- wiki_to_mid[sprintf('<http://en.wikipedia.org/wiki/%s>', name)]
raw <- log10(x[mid, colnames(x) %in% days])
smoothed <- X[mid, colnames(X) %in% days]
ymin <- min(raw[is.finite(raw)], na.rm=TRUE)
col <- as.vector(col2rgb(COL[[medium]])/255)
col <- rgb(col[1], col[2], col[3], 0.2)
plot(days, raw, col=col, main='', pch=20, cex=2, cex.axis=0.7, cex.lab=0.7,
bty='o', xlab='Days since death', ylab='Fraction mentioning documents [log10]', ylim=c(ymin,-1),
panel.first=c(bg('white'), abline(v=0, col='gray', lty=2)))
lines(names(smoothed), smoothed, col=COL[[medium]], lwd=2)
text(max(days), -1, sprintf('%s', gsub('_', ' ', name)), adj=c(1,1), cex=1) # cex=1.5)
y_before <- stats_N[mid, 'mean_before']
y_peak <- y_before + stats_N[mid, 'peak_mean_boost']
y_perm <- y_before + stats_N[mid, 'perm_boost']
x_tth <- stats_N[mid, 'time_till_half'] * max(as.numeric(colnames(X)))
y_tth <- mean(c(y_peak, y_perm))
# mean before
segments(x0=min(days), y0=y_before, x1=-30, lwd=2, col=feat_cols['mean_before'])
text(-60, y_before, 'pre-mortem\nmean', srt=0, adj=c(0.5,1.5), col=feat_cols['mean_before'], cex=cex)
# peak
segments(x0=0, y0=y_peak, x1=30)
arrows(x0=-40, y0=y_before, y1=y_peak, length=0.05, code=3, lwd=2, col=feat_cols['peak_mean_boost'])
text(-70, mean(c(y_before, y_peak)), 'short-\nterm\nboost', srt=0, adj=c(0.5,0.5), col=feat_cols['peak_mean_boost'], cex=cex)
# perm
segments(x0=30, y0=y_perm, x1=max(days))
arrows(x0=100, y0=y_before, y1=y_perm, length=0.05, code=3, lwd=2, col=feat_cols['perm_boost'])
text(110, mean(c(y_before, y_perm)), 'long-term boost', srt=0, adj=c(0,0.5), col=feat_cols['perm_boost'], cex=cex)
# tth
arrows(x0=1, y0=y_tth, x1=x_tth, length=0.05, code=3, lwd=2, col=feat_cols['time_till_half'])
text(x_tth+10, y_tth, 'halving time', srt=0, adj=c(0,0.5), col=feat_cols['time_till_half'], cex=cex)
### Arrow from example to cluster.
par(fig=c(0.40, 0.51, 0, 0.45), new=TRUE, mar=c(0,0,0,0))
plot(NULL, xaxt='n', yaxt='n', bty='n', xlab='', ylab='', xlim=0:1, ylim=0:1)
col <- 'gray50'
arrows(x0=0, y0=0.5, x1=0.5, length=0.05, code=0, lwd=5, col=col, ljoin=1, lend=2)
arrows(x0=0.5, y0=0.5, y1=0.40, length=0.05, code=0, lwd=5, col=col, ljoin=1, lend=2)
arrows(x0=0.5, y0=0.40, x1=0.98, length=0.2, code=2, lwd=5, col=col, ljoin=1, lend=1)
text(0.50, 0.53, 'Clustering', srt=0, adj=c(0,0.5), col=col, cex=1.5, font=2, srt=90)
if (SAVE_PLOTS) dev.off()
}
num_art_N <- get_num_art('NEWS')
num_art_T <- get_num_art('TWITTER')
data_N <- get_mention_freq_table('NEWS')
data_T <- get_mention_freq_table('TWITTER')
x_N <- get_rel_date_matrix('NEWS', data_N, num_art_N, CHUNK_SIZE)
x_T <- get_rel_date_matrix('TWITTER', data_T, num_art_T, CHUNK_SIZE)
mids_N <- filter_people('NEWS', x_N)
mids_T <- filter_people('TWITTER', x_T)
mids <- intersect(mids_N, mids_T)
x_N <- x_N[mids,]
x_T <- x_T[mids,]
X_N <- normalize_and_smooth('NEWS', x_N, num_art_N, mean_center=FALSE)
X_T <- normalize_and_smooth('TWITTER', x_T, num_art_T, mean_center=FALSE)
stats_all <- load_stats_all()
stats_N <- load_stats('NEWS', x_N, X_N, num_art_N)
stats_T <- load_stats('TWITTER', x_T, X_T, num_art_T)
lmdata_N <- make_regression_data('NEWS')
lmdata_T <- make_regression_data('TWITTER')
BASE_AGE <- levels(lmdata_N$age_group)[1]
# stats_N and stats_T contain the same people, so we arbitrarily use stats_N here.
natural <- which(stats_N$natural_death == 1)
unnatural <- which(stats_N$unnatural_death == 1)
# Are there any people with ambiguous causes of death? -- No!
length(intersect(natural, unnatural))
cluster_feats <- c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')
cluster_result_N <- cluster(stats_N, cluster_feats)
cluster_result_T <- cluster(stats_T, cluster_feats)
k <- 4
km_N <- cluster_result_N$clustering[[which(cluster_result_N$K==k)]]
km_T <- cluster_result_T$clustering[[which(cluster_result_T$K==k)]]
plot_num_art(num_art_N, 'NEWS')
plot_num_art(num_art_T, 'TWITTER')
Figure 1: Number of documents per day in the Spinn3r corpus.
News:
tapply(num_art_N, substring(names(num_art_N),0,4), function(x) mean(x, na.rm=TRUE))
## 2008 2009 2010 2011 2012 2013 2014
## 118874.4 269481.4 359729.0 222132.4 201000.3 169738.4 167931.8
Twitter:
tapply(num_art_T, substring(names(num_art_T),0,4), function(x) mean(x, na.rm=TRUE))
## 2008 2009 2010 2011 2012 2013
## 1071.196 1097066.229 6704034.431 19725105.305 27667683.221 50809628.488
## 2014
## 36663198.418
Acute myeloid leukemia, Adrenocortical carcinoma, Alveolar rhabdomyosarcoma, Alzheimer’s disease, Amyloidosis, Amyotrophic lateral sclerosis, Anemia, Aneurysm, Aortic aneurysm, Aortic dissection, Appendix cancer, Asthma, Astrocytoma, Atherosclerosis, Atypical teratoid rhabdoid tumor, B-cell chronic lymphocytic leukemia, Bladder cancer, Bleeding, Blood disorder, Blunt trauma, Bone cancer, Bone tumor, Brain Cancer, Brain damage, Brain tumor, Breast cancer, Bronchitis, Bronchopneumonia, Cancer, Cardiac arrest, Cardiac dysrhythmia, Cardiac surgery, Cardiopulmonary Arrest, Cardiovascular disease, Cerebral hemorrhage, Cerebral infarction, Cervical cancer, Cholangiocarcinoma, Chronic kidney disease, Chronic Obstructive Pulmonary Disease, Cirrhosis, Colorectal cancer, Complication, Complications from a stroke, Complications from cardiac surgery, Complications from pneumonia, Complications of diabetes mellitus, Congenital heart defect, Coronary artery disease, Craniocerebral Trauma, Creutzfeldt–Jakob disease, Cystic fibrosis, Dementia, Dementia with Lewy bodies, Diabetes mellitus, Disease, Ebola virus disease, Emphysema, Epileptic seizure, Esophageal cancer, Gallbladder cancer, Glioblastoma multiforme, Heart Ailment, Heart failure, Heart valve disease, Heat Stroke, Hepatitis, HIV/AIDS, Hodgkin’s lymphoma, Huntington’s disease, Hypertension, Hypertensive heart disease, Hyperthermia, Illness, Infection, Influenza, Internal bleeding, Intracranial aneurysm, Intracranial hemorrhage, Kidney cancer, Laryngeal cancer, Leiomyosarcoma, Leukemia, Liver cancer, Liver disease, Liver failure, Liver tumour, Lung cancer, Lung disease, Lung Infection, Lymphoma, Malaria, Melanoma, Meningitis, Mesothelioma, Metastatic breast cancer, Metastatic Melanoma, Motor neuron disease, Multiple myeloma, Multiple organ dysfunction syndrome, Multiple organ failure, Multiple sclerosis, Multiple system atrophy, Myelodysplastic syndrome, Myocardial infarction, Natural causes, Nephropathy, Non-Hodgkin lymphoma, Old age, Oral cancer, Organ dysfunction, Ovarian cancer, Pancreatic cancer, Pancreatitis, Parkinson’s disease, Peritonitis, Pneumonia, Pneumothorax, Polycythemia, Polymyalgia rheumatica, Progressive supranuclear palsy, prolonged illness, Prostate cancer, Pulmonary edema, Pulmonary embolism, Pulmonary failure, Pulmonary fibrosis, Pyelonephritis, Renal failure, Respiratory arrest, Respiratory disease, Respiratory failure, Salivary gland neoplasm, Sepsis, Septic shock, Skin cancer, Smallpox, Squamous-cell carcinoma, Stomach cancer, Stroke, Subarachnoid hemorrhage, Subdural hematoma, Surgery, Surgical complications, T-Cell Lymphoma, Terminal illness, Throat Cancer, Thrombosis, Thrombus, Thyroid cancer, Urinary tract infection, Uterine cancer, Vascular dementia, Viral pneumonia
Accident, Accidental drug overdose, Accidental fall, Airstrike, Alcohol intoxication, Ambush, Asphyxia, Assassination, Assisted suicide, Aviation accident or incident, Ballistic trauma, Bike accident, Blast injury, Boating Accident, Bomb, Brain injury, Capital punishment, Car bomb, Carbon monoxide poisoning, Casualty of war, Cocaine overdose, Combined drug intoxication, Decapitation, Drowning, Drug overdose, Execution, Execution by firing squad, Execution-style murder, Explosion, Falling, Falling from height, Fire, Firearm, Gunshot, Hanging, Helicopter crash, Heroin overdose, Hit and run, Homicide, Improvised bombing, Injury, Killed in action, Lethal injection, Lightning, Major trauma, Motorcycle accident, Mountaineering, Murder, Murder_suicide, Murder–suicide, Poison, Poisoning, Racing Accident, Self-inflicted wound, Shark attack, Shootout, Skiing accident, Smoke inhalation, Stab wound, Stabbing, Strangling, Struck by car, Suicide, Suicide attack, Suicide by hanging, Tornado Incident, Torture, Traffic collision, Traumatic brain injury
The first column represents the set of all 33340 Freebase people who died in the same time period studied (11 June 2009 to 30 September 2014). The second column represents the 2362 people studied. The third column represents the subset of the 870 people included in the regression analysis.
(A note on age: among all people in Freebase, there are some obvious age errors [e.g., 610 years]. Even when ignoring all ages above 120, the mean changes only slightly, dropping from 76.26 to 76.22.)
table <- make_bio_table()
table
## All Included Regression
## Age
## Age N/A 10% 6% 0%
## 1st quartile 68 64 60
## Mean 76 74 70
## Median 80 77 73
## 3rd quartile 88 87 84
## Gender
## Gender N/A 27% 7% 0%
## Female 16% 17% 17%
## Male 84% 83% 83%
## Manner of death
## Manner of death N/A 76% 60% 0%
## Natural 85% 86% 88%
## Unnatural 15% 14% 12%
## Language
## Language N/A 45% 27% 14%
## Anglophone 60% 82% 80%
## Non-anglophone 40% 18% 20%
## Notability type
## Notability type N/A 1% 0% 0%
## art 40% 50% 56%
## sports 14% 14% 14%
## leadership 11% 14% 13%
## known for death 26% 16% 10%
## general fame 7% 4% 5%
## academia/engineering 2% 2% 2%
## Count 33340 2362 870
Table 1: Biographic statistics of public figures.
(a) News
plot_time_series_for_small_multiples('Benoit_Mandelbrot', 'NEWS', X_N, x_N, num_art_N, ylab=TRUE)
plot_time_series_for_small_multiples('Kashiram_Rana', 'NEWS', X_N, x_N, num_art_N, ylab=FALSE)
plot_time_series_for_small_multiples('Amy_Winehouse', 'NEWS', X_N, x_N, num_art_N, ylab=FALSE)
plot_time_series_for_small_multiples('George_McGovern', 'NEWS', X_N, x_N, num_art_N, ylab=FALSE)
(b) Twitter
plot_time_series_for_small_multiples('Benoit_Mandelbrot', 'TWITTER', X_T, x_T, num_art_T, ylab=TRUE)
plot_time_series_for_small_multiples('Kashiram_Rana', 'TWITTER', X_T, x_T, num_art_T, ylab=FALSE)
plot_time_series_for_small_multiples('Amy_Winehouse', 'TWITTER', X_T, x_T, num_art_T, ylab=FALSE)
plot_time_series_for_small_multiples('George_McGovern', 'TWITTER', X_T, x_T, num_art_T, ylab=FALSE)
Figure 2: Examples of mention time series for four deceased public figures.
overlay_time_series(X_N, 'NEWS')
overlay_time_series(X_T, 'TWITTER')
Figure 3: Overlay of all mention time series.
mmh_N <- plot_max_mem_hist('NEWS')
mmh_T <- plot_max_mem_hist('TWITTER')
Figure 4: Histogram of post-mortem peak days. Left: news. Right: Twitter. For each post-mortem day we count for how many individuals the mention frequency is largest on that day, out of the 100 days immediately following death. For most people, the maximum mention frequency is observed on the day after death.
The first entry of each vector corresponds to day 0 (i.e., the day of death).
h_N <- mmh_N$hist
h_T <- mmh_T$hist
max_days_N <- mmh_N$max_days
max_days_T <- mmh_T$max_days
probs_N <- (h_N$counts/sum(h_N$counts))[1:30]
probs_T <- (h_T$counts/sum(h_T$counts))[1:30]
News:
probs_N
## [1] 0.2167654530 0.2933954276 0.1109229467 0.0444538527 0.0309060119
## [6] 0.0258255715 0.0207451312 0.0114309907 0.0135478408 0.0101608806
## [11] 0.0076206605 0.0063505504 0.0050804403 0.0042337003 0.0038103302
## [16] 0.0033869602 0.0063505504 0.0008467401 0.0029635902 0.0029635902
## [21] 0.0046570703 0.0033869602 0.0021168501 0.0033869602 0.0042337003
## [26] 0.0021168501 0.0033869602 0.0012701101 0.0029635902 0.0025402202
Twitter:
probs_T
## [1] 0.2878916173 0.3302286198 0.0999153260 0.0482641829 0.0325994920
## [6] 0.0169348010 0.0156646909 0.0084674005 0.0042337003 0.0080440305
## [11] 0.0076206605 0.0055038103 0.0029635902 0.0029635902 0.0059271804
## [16] 0.0025402202 0.0055038103 0.0025402202 0.0012701101 0.0033869602
## [21] 0.0025402202 0.0004233700 0.0016934801 0.0008467401 0.0012701101
## [26] 0.0025402202 0.0055038103 0.0021168501 0.0016934801 0.0021168501
The first entry of each vector corresponds to day 0 (i.e., the day of death).
News:
cumsum(probs_N)
## [1] 0.2167655 0.5101609 0.6210838 0.6655377 0.6964437 0.7222693 0.7430144
## [8] 0.7544454 0.7679932 0.7781541 0.7857748 0.7921253 0.7972058 0.8014395
## [15] 0.8052498 0.8086367 0.8149873 0.8158340 0.8187976 0.8217612 0.8264183
## [22] 0.8298052 0.8319221 0.8353091 0.8395428 0.8416596 0.8450466 0.8463167
## [29] 0.8492803 0.8518205
Twitter:
cumsum(probs_T)
## [1] 0.2878916 0.6181202 0.7180356 0.7662997 0.7988992 0.8158340 0.8314987
## [8] 0.8399661 0.8441998 0.8522439 0.8598645 0.8653683 0.8683319 0.8712955
## [15] 0.8772227 0.8797629 0.8852667 0.8878069 0.8890771 0.8924640 0.8950042
## [22] 0.8954276 0.8971211 0.8979678 0.8992379 0.9017782 0.9072820 0.9093988
## [29] 0.9110923 0.9132091
Those whose peak day comes after day 29 tend to be those with low short-term boosts, as seen by computing the relative rank with respect to short-term boost of those with a late peak.
News:
max_rank <- length(max_days_N)-1
(summary(stats_N$peak_mean_boost_rank[max_days_N>29])-1) / max_rank
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2238 0.4439 0.4620 0.6837 0.9958
# Compare this to the fraction of people with a late peak day.
(sum(max_days_N>29)-1) / max_rank
## [1] 0.1478187
Twitter:
(summary(stats_T$peak_mean_boost_rank[max_days_T>29])-1) / max_rank
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.002541 0.181279 0.430750 0.454508 0.713681 0.998306
# Compare this to the fraction of people with a late peak day.
(sum(max_days_T>29)-1) / max_rank
## [1] 0.08640407
biexp_model_N <- biexp_fit('NEWS')
biexp_model_T <- biexp_fit('TWITTER')
(a) News
plot_avg_fraction_of_mentioning_docs('NEWS', biexp_model=biexp_model_N)
(b) Twitter
plot_avg_fraction_of_mentioning_docs('TWITTER', biexp_model=biexp_model_T)
Figure 5: Average mention time series, obtained via the arithmetic mean of the individual raw mention time series of the people included in the study, for (a) news and (b) Twitter.
News:
coef(biexp_model_N)
## p N r q
## 2.365624e-01 3.462593e-05 1.489664e-02 5.124453e-04
Twitter:
coef(biexp_model_T)
## p N r q
## 2.548633e-01 9.127908e-07 7.908213e-03 4.961750e-04
plot_comm_vs_cult_mem('NEWS', biexp_coefs=coef(biexp_model_N))
plot_comm_vs_cult_mem('TWITTER', biexp_coefs=coef(biexp_model_T))
Figure 6: Proportion of cultural vs. communicative memory. On day \(t\) after death, the total collective memory according to the biexponential model fit is \(S(t)=u(t)+v(t)\), where \(u(t)\) is the communicative memory, and \(v(t)\) is the cultural memory. We plot the proportion \(v(t)/S(t)\) as a function of \(t\). Left: news. Right: Twitter.
(a) News
plot_all_model_fits('NEWS')
(b) Twitter
plot_all_model_fits('TWITTER')
## Warning in nls(log(y) ~ -log(a + b * t^c), start = list(a = a0, b = b0, :
## Convergence failure: function evaluation limit reached without convergence (9)
Figure 7: Comparison of eight models \(S(t)\) for fitting empirical mention frequencies, for (a) the news and (b) Twitter. All \(y\)-axes are logarithmic. The left and right plots in each row show the same fits, the only difference being that the left plots have linear \(x\)-axes, whereas the right plots have logarithmic \(x\)-axes. Fits are nonlinear least-squares fits obtained in log space, i.e., logarithms were taken of both the empirical data and the model \(S(t)\) before performing the least-squares optimization (see equation (5) in the paper for the case of the biexponential model). The biexponential function was introduced by Candia et al. (“The universal decay of collective memory and attention.” Nature Human Behaviour. 2019; 3(1):82-91) and is defined in equation (4) of the paper. Theoretical motivations for six of the seven remaining functions are given by Rubin and Wenzel (“One hundred years of forgetting: A quantitative description of retention.” Psychological Review. 1996; 103(4):734-760). Four of these functions are parameterized by two parameters \(a,b>0\), as follows:
The four above functions share the shortcoming of being concave (exponential, hyperbolic, logarithmic) or linear (power) when plotted on log-log axes (i.e., \(\log S(t)\) is a concave function of \(\log t\), cf. right columns), whereas the empirical curves are convex on log-log axes.
Rubin and Wenzel also proposed generalized versions of the exponential and hyperbolic functions, called exponential-power and hyperbolic-power functions, respectively, where \(t\) is replaced by the power \(t^c\), where \(c\) is a third parameter. (Analogous generalized versions of the logarithmic and power functions are not necessary, as they can already be expressed by the plain logarithmic and power functions, since \(b \log t^c = (bc) \log t\).) For \(c>0\), the exponential-power and hyperbolic-power functions, too, are concave in log-log space, but when allowing for \(b,c < 0\), they can be made convex and are thus better suited for fitting the empirical data (note that, in the following specifications, we maintain \(a,b,c > 0\), but replace \(b\) by \(-b\), and \(c\) by \(-c\)):
Finally, as the seventh function, we consider what Candia et al. refer to as the “log-normal” function, defined as \(S(t) = \exp(\log a - b \log t - c (\log t)^2)\). To recognize the fact that, although this function takes the functional form of the log-normal distribution, it is not actually used to describe a probability distribution here, we refer to the function as “log-normal-inspired”. The log-normal-inspired function, too, is concave in log-log space, but can be made convex by replacing \(c\) by \(-c\), i.e., \(S(t) = \exp(\log a - b \log t + c (\log t)^2)\). Note, however, that this results in \(S(t)\) being an increasing function of \(t\) as \(t \rightarrow \infty\), unlike the empirical data and unlike what one would require from a sound theoretical model of collective memory. We hence constrain the parameters such that the fitted function is monotonically decreasing over the modeled time range (days 1 to \(t_{\max}=400\)). Since the unconstrained function, when fitted to the empirical data, assumes a minimum at \(1<t<t_{\max}\), the monotonicity constraint is equivalent to requiring the minimum to occur at \(t=t_{\max}\), which happens for \(b=-2ct_{\max}\) and gives rise to the following model:
We quantify goodness of fit using two measures (results in the legends of the right plots): (1) via coefficients of determination (\(R^2\); computed as the squared correlation between observed and predicted values on the log scale; larger is better); (2) in order to account for the varying model complexity (the models have between two and four parameters), via Akaike’s information criterion (AIC; smaller is better). \(R^2\) and AIC result in the same ordering of the eight models, and the ordering is identical across the two media (news and Twitter). In the figure, the models are sorted, top-down, in increasing order of goodness of fit. The biexponential model provides the best fit according to both measures (\(R^2\) and AIC) and for both media (news and Twitter).
(a) News
summ <- summary(stats_N[, c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')])
colnames(summ) <- FANCY_CURVE_CHAR_NAMES[trimws(colnames(summ))]
summ
## Pre-mortem mean Short-term boost Long-term boost Halving time
## Min. :-5.851 Min. :-0.2055 Min. :-1.5490200 Min. :0.01667
## 1st Qu.:-5.835 1st Qu.: 1.1924 1st Qu.:-0.0147554 1st Qu.:0.05903
## Median :-5.819 Median : 1.9754 Median : 0.0005455 Median :0.16111
## Mean :-5.755 Mean : 1.9171 Mean : 0.0191381 Mean :0.23172
## 3rd Qu.:-5.770 3rd Qu.: 2.6687 3rd Qu.: 0.0270671 3rd Qu.:0.35833
## Max. :-3.267 Max. : 4.2103 Max. : 1.3884812 Max. :0.95833
(b) Twitter
summ <- summary(stats_T[, c('mean_before', 'peak_mean_boost', 'perm_boost', 'time_till_half')])
colnames(summ) <- FANCY_CURVE_CHAR_NAMES[trimws(colnames(summ))]
summ
## Pre-mortem mean Short-term boost Long-term boost Halving time
## Min. :-7.866 Min. :-0.0413 Min. :-0.557536 Min. :0.01667
## 1st Qu.:-7.839 1st Qu.: 1.5856 1st Qu.:-0.006985 1st Qu.:0.06667
## Median :-7.803 Median : 2.4474 Median : 0.016047 Median :0.15972
## Mean :-7.664 Mean : 2.3262 Mean : 0.055143 Mean :0.22363
## 3rd Qu.:-7.671 3rd Qu.: 3.0998 3rd Qu.: 0.077694 3rd Qu.:0.33611
## Max. :-4.576 Max. : 4.9904 Max. : 1.552516 Max. :0.88056
Table 2: Summaries of distribution of mention time series characteristics for (a) the news and (b) Twitter.
\[\\[4mm]\]
plot_smoothed_densities_raw(stats_N, 'NEWS', 'mid', list(unique(stats_N$mid)), main='NEWS')
plot_smoothed_densities_raw(stats_T, 'TWITTER', 'mid', list(unique(stats_T$mid)), main='TWITTER')
Figure 8: Kernel-smoothed density plots of mention time series characteristics for the news (top) and Twitter (bottom).
News:
boot_ci(stats_N$peak_mean_boost, median)
## upper point_est lower
## 2.031260 1.975446 1.896458
wilcox.test(stats_N$peak_mean_boost, mu=0, conf.int=TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: stats_N$peak_mean_boost
## V = 2790570, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
## 1.886672 1.967571
## sample estimates:
## (pseudo)median
## 1.927002
Twitter:
boot_ci(stats_T$peak_mean_boost, median)
## upper point_est lower
## 2.500643 2.447381 2.372522
wilcox.test(stats_T$peak_mean_boost, mu=0, conf.int=TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: stats_T$peak_mean_boost
## V = 2790610, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
## 2.309257 2.402026
## sample estimates:
## (pseudo)median
## 2.355734
wilcox.test(x=stats_N$peak_mean_boost, y=stats_T$peak_mean_boost, paired=TRUE, conf.int=TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: stats_N$peak_mean_boost and stats_T$peak_mean_boost
## V = 477893, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -0.4415986 -0.3917311
## sample estimates:
## (pseudo)median
## -0.416588
News:
boot_ci(stats_N$perm_boost, median)
## upper point_est lower
## 0.0017260233 0.0005454994 -0.0009987135
wilcox.test(stats_N$perm_boost, mu=0, conf.int=TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: stats_N$perm_boost
## V = 1560596, p-value = 6.199e-07
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
## 0.002285753 0.005541149
## sample estimates:
## (pseudo)median
## 0.003893564
Twitter:
boot_ci(stats_T$perm_boost, median)
## upper point_est lower
## 0.01751758 0.01604730 0.01334752
wilcox.test(stats_T$perm_boost, mu=0, conf.int=TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: stats_T$perm_boost
## V = 2053579, p-value < 2.2e-16
## alternative hypothesis: true location is not equal to 0
## 95 percent confidence interval:
## 0.02424449 0.03112195
## sample estimates:
## (pseudo)median
## 0.02754178
wilcox.test(x=stats_N$perm_boost, y=stats_T$perm_boost, paired=TRUE, conf.int=TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: stats_N$perm_boost and stats_T$perm_boost
## V = 881590, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
## 95 percent confidence interval:
## -0.02542398 -0.01933168
## sample estimates:
## (pseudo)median
## -0.02231052
(a) News
plot_chars_by_group(stats_N, 'NEWS', stats_N$type_group, type_groups)
(b) Twitter
plot_chars_by_group(stats_T, 'TWITTER', stats_T$type_group, type_groups)
Figure 9: Distributions of mention time series characteristics by notability type, visualized as box plots, for (a) the news and (b) Twitter. Boxes are bounded by the first and third quartiles; whiskers extend 1.5 inter-quartile ranges beyond the first and third quartiles (or to the minimum/maximum in case they fall within 1.5 inter-quartile ranges); the center bars mark medians, with notches corresponding to 95% confidence intervals of the median.
plot_sil_width('NEWS', cluster_result_N$clustering, cluster_result_N$diss)
plot_sil_width('TWITTER', cluster_result_T$clustering, cluster_result_T$diss)
Figure 10: Average silhouette width of clusterings produced by \(k\)-means algorithm, as a function of the number \(k\) of clusters (higher is better), for \(k \in \{2, \dots, 30\}\). Both (a) in the news and (b) on Twitter, \(k=4\) is optimal.
plot_clustered_time_series('NEWS', km_N, X_N)
plot_clustered_time_series('TWITTER', km_T, X_T)
Figure 11: Overlay of time series in each cluster, for news (top) and Twitter (bottom).
X <- table(as.factor(km_N$cluster), as.factor(km_T$cluster))
chisq.test(X, simulate.p.value=TRUE, B=1e5)
##
## Pearson's Chi-squared test with simulated p-value (based on 1e+05
## replicates)
##
## data: X
## X-squared = 1739.3, df = NA, p-value = 1e-05
Null probabilites represent the case where news and Twitter are assumed to be independent (with the constraint that cluster sizes must remain constant).
The following matrix shows the log10 ratio of the empirical cluster size and the cluster size under the null model (rows are news cluster, columns are Twitter clusters). That is, positive values imply overrepresentation in the empirical data, and negative values, underrepresentation. We see that all diagonal entries are overrepresented, whereas all off-diagonal entries except two—\((3,4)\) and \((4,3)\)—are underrepresented.
X.indep <- (rowSums(X) %o% colSums(X)) / sum(X)
log10(X / X.indep)
##
## 1 2 3 4
## 1 0.12144877 -0.41341084 -0.03466453 -0.37941148
## 2 -0.31927109 0.43136894 -0.77466424 -0.76366886
## 3 -0.40569573 -1.34225550 0.68375399 0.72277810
## 4 -0.24239967 -0.89911874 0.01774628 1.06900484
To quantify the overrepresentation of the diagonal, we compute four diagonal sums (everything under the constraint that the row and column sums are fixed to the empirical values):
The minimum and maximum are computed via linear programming.
k <- nrow(X)
objective.in <- as.vector(diag(nrow=k))
const.mat <- rbind(t(diag(1,k) %x% rep(1,k)),
t(rep(1,k) %x% diag(1,k)))
const.rhs <- c(colSums(X), rowSums(X))
const.dir <- rep('==', k)
opt.max <- lp('max', objective.in, const.mat, const.dir, const.rhs)
X.max <- matrix(opt.max$solution, k, k)
opt.min <- lp('min', objective.in, const.mat, const.dir, const.rhs)
X.min <- matrix(opt.min$solution, k, k)
The values of the four above-described diagonal sums are as follows:
sums <- c(sum(diag(X.min)), sum(diag(X.indep)), sum(diag(X)), sum(diag(X.max)))
sums
## [1] 505.000 1059.347 1727.000 2236.000
Normalizing by the min-to-max span, we see that the empirical diagonal gets 71% of the way between min and max, whereas the null only gets 32%, for a factor of 2.2.
(sums - sum(diag(X.min))) / (sum(diag(X.max)) - sum(diag(X.min)))
## [1] 0.0000000 0.3202468 0.7059503 1.0000000
On the contrary, nearly all off-diagonal entries are underrepresented, with the exception of \((4,3)\), which is not significantly different from the null:
prop.test(X[4,3], sum(X), X.indep[4,3]/sum(X))
##
## 1-sample proportions test with continuity correction
##
## data: X[4, 3] out of sum(X), null probability X.indep[4, 3]/sum(X)
## X-squared = 5.0682e-29, df = 1, p-value = 1
## alternative hypothesis: true p is not equal to 0.002844932
## 95 percent confidence interval:
## 0.001359169 0.006264032
## sample estimates:
## p
## 0.00296359
and \((3,4)\), which is massively overrepresented, compared to the null:
prop.test(X[3,4], sum(X), X.indep[3,4]/sum(X))
##
## 1-sample proportions test with continuity correction
##
## data: X[3, 4] out of sum(X), null probability X.indep[3, 4]/sum(X)
## X-squared = 135.03, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.003206284
## 95 percent confidence interval:
## 0.01228142 0.02321996
## sample estimates:
## p
## 0.0169348
Note: the variable names in the code may differ from those in the paper; see mapping in lists below.
Independent variables:
mean_before)age_group)death_type)type_group)anglo)gender)Dependendent variables:
peak_mean_boost)perm_boost)for (ranks in c(FALSE, TRUE)) {
mods_N <- run_regression_analysis('NEWS', ranks)
mods_T <- run_regression_analysis('TWITTER', ranks)
print_regression_table(mods_N, mods_T, ranks)
print_model_description('NEWS', 'short-term boost', ranks)
print(summary(mods_N$mod_peak))
print_model_description('NEWS', 'long-term boost', ranks)
print(summary(mods_N$mod_perm))
print_model_description('TWITTER', 'short-term boost', ranks)
print(summary(mods_T$mod_peak))
print_model_description('TWITTER', 'long-term boost', ranks)
print(summary(mods_T$mod_perm))
}
##
##
## #####################################################
## REGRESSION MODEL
## Medium: NEWS
## Dependent variable: short-term boost
## Transformation on dependent variable: NONE
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3921 -0.4654 0.1215 0.5730 2.0660
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.32157 0.06293 36.890 < 2e-16 ***
## mean_before_relrank 0.80417 0.09297 8.650 < 2e-16 ***
## age_group20 0.16165 0.17015 0.950 0.3423
## age_group30 0.40001 0.16740 2.390 0.0171 *
## age_group40 -0.04607 0.12599 -0.366 0.7147
## age_group50 -0.07459 0.09893 -0.754 0.4511
## age_group60 -0.10927 0.08172 -1.337 0.1816
## age_group80 0.02162 0.07826 0.276 0.7824
## age_group90 0.17414 0.09779 1.781 0.0753 .
## death_typeunnatural 0.61845 0.09460 6.537 1.08e-10 ***
## genderFemale 0.08287 0.07213 1.149 0.2509
## anglonon_anglo -0.31567 0.07378 -4.279 2.09e-05 ***
## anglounknown -0.44551 0.08550 -5.211 2.36e-07 ***
## type_groupacademia/engineering 0.18120 0.19745 0.918 0.3590
## type_groupgeneral fame 0.06993 0.12391 0.564 0.5727
## type_groupknown for death -0.10659 0.09933 -1.073 0.2835
## type_groupleadership 0.15217 0.08285 1.837 0.0666 .
## type_groupsports 0.04886 0.08336 0.586 0.5580
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7725 on 852 degrees of freedom
## Multiple R-squared: 0.2131, Adjusted R-squared: 0.1974
## F-statistic: 13.57 on 17 and 852 DF, p-value: < 2.2e-16
##
##
##
## #####################################################
## REGRESSION MODEL
## Medium: NEWS
## Dependent variable: long-term boost
## Transformation on dependent variable: NONE
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.92517 -0.07187 -0.02152 0.03771 1.19435
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0879028 0.0137837 6.377 2.95e-10 ***
## mean_before_relrank 0.0305324 0.0203625 1.499 0.13413
## age_group20 0.0603247 0.0372663 1.619 0.10587
## age_group30 0.0277988 0.0366633 0.758 0.44853
## age_group40 -0.0009074 0.0275939 -0.033 0.97377
## age_group50 -0.0578543 0.0216672 -2.670 0.00773 **
## age_group60 -0.0499194 0.0178988 -2.789 0.00541 **
## age_group80 -0.0181927 0.0171416 -1.061 0.28884
## age_group90 -0.0111474 0.0214176 -0.520 0.60287
## death_typeunnatural 0.0973507 0.0207195 4.699 3.05e-06 ***
## genderFemale 0.0343766 0.0157970 2.176 0.02982 *
## anglonon_anglo -0.0610353 0.0161588 -3.777 0.00017 ***
## anglounknown -0.0790566 0.0187267 -4.222 2.69e-05 ***
## type_groupacademia/engineering -0.0315605 0.0432446 -0.730 0.46570
## type_groupgeneral fame -0.0102523 0.0271399 -0.378 0.70571
## type_groupknown for death -0.0213581 0.0217556 -0.982 0.32651
## type_groupleadership -0.0581051 0.0181453 -3.202 0.00141 **
## type_groupsports -0.0341565 0.0182585 -1.871 0.06173 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1692 on 852 degrees of freedom
## Multiple R-squared: 0.1234, Adjusted R-squared: 0.1059
## F-statistic: 7.056 on 17 and 852 DF, p-value: 3.001e-16
##
##
##
## #####################################################
## REGRESSION MODEL
## Medium: TWITTER
## Dependent variable: short-term boost
## Transformation on dependent variable: NONE
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.97464 -0.40997 0.07498 0.57097 2.10379
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.67046 0.06666 40.061 < 2e-16 ***
## mean_before_relrank 0.94776 0.09955 9.521 < 2e-16 ***
## age_group20 0.71833 0.17951 4.002 6.84e-05 ***
## age_group30 0.64917 0.17658 3.676 0.000251 ***
## age_group40 0.35056 0.13278 2.640 0.008438 **
## age_group50 0.18143 0.10435 1.739 0.082470 .
## age_group60 0.13015 0.08611 1.511 0.131051
## age_group80 0.02061 0.08247 0.250 0.802727
## age_group90 0.03437 0.10319 0.333 0.739172
## death_typeunnatural 0.28194 0.09978 2.826 0.004829 **
## genderFemale -0.03354 0.07591 -0.442 0.658720
## anglonon_anglo -0.11627 0.07763 -1.498 0.134593
## anglounknown -0.32516 0.09079 -3.582 0.000361 ***
## type_groupacademia/engineering 0.33965 0.20822 1.631 0.103220
## type_groupgeneral fame 0.13243 0.13075 1.013 0.311418
## type_groupknown for death -0.08814 0.10562 -0.835 0.404208
## type_groupleadership 0.11290 0.08677 1.301 0.193592
## type_groupsports 0.07169 0.08830 0.812 0.417031
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8148 on 852 degrees of freedom
## Multiple R-squared: 0.1917, Adjusted R-squared: 0.1755
## F-statistic: 11.88 on 17 and 852 DF, p-value: < 2.2e-16
##
##
##
## #####################################################
## REGRESSION MODEL
## Medium: TWITTER
## Dependent variable: long-term boost
## Transformation on dependent variable: NONE
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.55367 -0.08110 -0.01437 0.05324 1.17998
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.095062 0.014816 6.416 2.32e-10 ***
## mean_before_relrank 0.086441 0.022127 3.907 0.000101 ***
## age_group20 0.192254 0.039899 4.818 1.71e-06 ***
## age_group30 0.118044 0.039248 3.008 0.002711 **
## age_group40 0.099524 0.029512 3.372 0.000779 ***
## age_group50 -0.024487 0.023195 -1.056 0.291393
## age_group60 -0.024520 0.019140 -1.281 0.200516
## age_group80 -0.012644 0.018330 -0.690 0.490506
## age_group90 -0.023678 0.022936 -1.032 0.302203
## death_typeunnatural 0.106270 0.022178 4.792 1.95e-06 ***
## genderFemale 0.005622 0.016873 0.333 0.739083
## anglonon_anglo -0.037016 0.017256 -2.145 0.032225 *
## anglounknown -0.080837 0.020179 -4.006 6.72e-05 ***
## type_groupacademia/engineering 0.023237 0.046282 0.502 0.615747
## type_groupgeneral fame -0.007989 0.029062 -0.275 0.783457
## type_groupknown for death 0.008185 0.023476 0.349 0.727436
## type_groupleadership -0.039897 0.019287 -2.069 0.038885 *
## type_groupsports -0.034255 0.019626 -1.745 0.081269 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1811 on 852 degrees of freedom
## Multiple R-squared: 0.1776, Adjusted R-squared: 0.1612
## F-statistic: 10.82 on 17 and 852 DF, p-value: < 2.2e-16
##
##
##
## #####################################################
## REGRESSION MODEL
## Medium: NEWS
## Dependent variable: short-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62044 -0.19422 0.01128 0.20580 0.74237
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.001595 0.020820 0.077 0.9390
## mean_before_relrank 0.292185 0.030757 9.500 < 2e-16 ***
## age_group20 0.054393 0.056290 0.966 0.3342
## age_group30 0.119792 0.055379 2.163 0.0308 *
## age_group40 -0.020965 0.041680 -0.503 0.6151
## age_group50 -0.029145 0.032728 -0.891 0.3734
## age_group60 -0.046225 0.027036 -1.710 0.0877 .
## age_group80 0.011330 0.025892 0.438 0.6618
## age_group90 0.070412 0.032351 2.177 0.0298 *
## death_typeunnatural 0.223393 0.031296 7.138 2.03e-12 ***
## genderFemale 0.032258 0.023861 1.352 0.1768
## anglonon_anglo -0.100770 0.024407 -4.129 4.01e-05 ***
## anglounknown -0.149597 0.028286 -5.289 1.57e-07 ***
## type_groupacademia/engineering 0.054180 0.065320 0.829 0.4071
## type_groupgeneral fame 0.025013 0.040994 0.610 0.5419
## type_groupknown for death -0.041843 0.032861 -1.273 0.2033
## type_groupleadership 0.027634 0.027408 1.008 0.3136
## type_groupsports 0.001862 0.027579 0.068 0.9462
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2555 on 852 degrees of freedom
## Multiple R-squared: 0.2343, Adjusted R-squared: 0.219
## F-statistic: 15.34 on 17 and 852 DF, p-value: < 2.2e-16
##
##
##
## #####################################################
## REGRESSION MODEL
## Medium: NEWS
## Dependent variable: long-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.60548 -0.21923 0.00579 0.23231 0.57350
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.069386 0.022386 3.099 0.002002 **
## mean_before_relrank -0.035601 0.033071 -1.076 0.282009
## age_group20 0.014011 0.060525 0.231 0.816994
## age_group30 -0.025187 0.059546 -0.423 0.672407
## age_group40 -0.043987 0.044816 -0.982 0.326618
## age_group50 -0.116348 0.035190 -3.306 0.000985 ***
## age_group60 -0.085145 0.029070 -2.929 0.003491 **
## age_group80 -0.009021 0.027840 -0.324 0.745990
## age_group90 0.039381 0.034785 1.132 0.257902
## death_typeunnatural 0.134966 0.033651 4.011 6.58e-05 ***
## genderFemale 0.037946 0.025656 1.479 0.139504
## anglonon_anglo -0.108631 0.026244 -4.139 3.83e-05 ***
## anglounknown -0.145758 0.030415 -4.792 1.94e-06 ***
## type_groupacademia/engineering -0.019779 0.070235 -0.282 0.778306
## type_groupgeneral fame -0.002759 0.044079 -0.063 0.950104
## type_groupknown for death -0.064889 0.035334 -1.836 0.066638 .
## type_groupleadership -0.071294 0.029470 -2.419 0.015763 *
## type_groupsports -0.042940 0.029654 -1.448 0.147977
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2748 on 852 degrees of freedom
## Multiple R-squared: 0.1148, Adjusted R-squared: 0.0971
## F-statistic: 6.498 on 17 and 852 DF, p-value: 1.155e-14
##
##
##
## #####################################################
## REGRESSION MODEL
## Medium: TWITTER
## Dependent variable: short-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.71673 -0.18473 -0.00208 0.20710 0.61854
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.033516 0.021153 -1.584 0.113459
## mean_before_relrank 0.336775 0.031589 10.661 < 2e-16 ***
## age_group20 0.224402 0.056963 3.939 8.84e-05 ***
## age_group30 0.216619 0.056034 3.866 0.000119 ***
## age_group40 0.099944 0.042134 2.372 0.017910 *
## age_group50 0.048430 0.033115 1.462 0.143973
## age_group60 0.038284 0.027326 1.401 0.161574
## age_group80 0.003695 0.026169 0.141 0.887740
## age_group90 0.006365 0.032745 0.194 0.845930
## death_typeunnatural 0.095068 0.031662 3.003 0.002756 **
## genderFemale -0.007467 0.024089 -0.310 0.756643
## anglonon_anglo -0.030620 0.024636 -1.243 0.214240
## anglounknown -0.107934 0.028809 -3.746 0.000191 ***
## type_groupacademia/engineering 0.120612 0.066075 1.825 0.068295 .
## type_groupgeneral fame 0.052868 0.041492 1.274 0.202944
## type_groupknown for death -0.026970 0.033516 -0.805 0.421209
## type_groupleadership 0.025037 0.027535 0.909 0.363470
## type_groupsports 0.026369 0.028019 0.941 0.346917
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2586 on 852 degrees of freedom
## Multiple R-squared: 0.2161, Adjusted R-squared: 0.2004
## F-statistic: 13.82 on 17 and 852 DF, p-value: < 2.2e-16
##
##
##
## #####################################################
## REGRESSION MODEL
## Medium: TWITTER
## Dependent variable: long-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.67278 -0.19342 0.02505 0.21969 0.60132
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.038607 0.022426 1.722 0.085511 .
## mean_before_relrank 0.118846 0.033490 3.549 0.000408 ***
## age_group20 0.111425 0.060390 1.845 0.065372 .
## age_group30 0.106071 0.059405 1.786 0.074526 .
## age_group40 0.060312 0.044669 1.350 0.177307
## age_group50 -0.060151 0.035107 -1.713 0.087008 .
## age_group60 -0.044402 0.028970 -1.533 0.125726
## age_group80 -0.015164 0.027744 -0.547 0.584804
## age_group90 -0.033081 0.034715 -0.953 0.340897
## death_typeunnatural 0.096865 0.033567 2.886 0.004004 **
## genderFemale 0.029224 0.025538 1.144 0.252806
## anglonon_anglo -0.031642 0.026118 -1.212 0.226027
## anglounknown -0.139591 0.030542 -4.570 5.59e-06 ***
## type_groupacademia/engineering 0.076049 0.070050 1.086 0.277952
## type_groupgeneral fame 0.006527 0.043988 0.148 0.882072
## type_groupknown for death -0.025012 0.035532 -0.704 0.481664
## type_groupleadership -0.102206 0.029192 -3.501 0.000487 ***
## type_groupsports -0.025534 0.029704 -0.860 0.390257
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2741 on 852 degrees of freedom
## Multiple R-squared: 0.1189, Adjusted R-squared: 0.1013
## F-statistic: 6.765 on 17 and 852 DF, p-value: 2.014e-15
##
##
## #####################################################
## REGRESSION MODEL
## Medium: NEWS
## Dependent variable: short-term boost
## Transformation on dependent variable: NONE
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3913 -0.4656 0.1301 0.5624 2.0282
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank 0.79350 0.09382 8.458 < 2e-16 ***
## age_group70 2.31291 0.06392 36.184 < 2e-16 ***
## age_group20 2.70452 0.26667 10.142 < 2e-16 ***
## age_group30 2.62259 0.23921 10.964 < 2e-16 ***
## age_group40 2.22989 0.13600 16.397 < 2e-16 ***
## age_group50 2.28254 0.09768 23.368 < 2e-16 ***
## age_group60 2.21414 0.06950 31.859 < 2e-16 ***
## age_group80 2.34947 0.06286 37.377 < 2e-16 ***
## age_group90 2.50226 0.08461 29.574 < 2e-16 ***
## death_typeunnatural 0.93213 0.30056 3.101 0.00199 **
## genderFemale 0.08619 0.07254 1.188 0.23513
## anglonon_anglo -0.32918 0.07476 -4.403 1.20e-05 ***
## anglounknown -0.45736 0.08604 -5.316 1.36e-07 ***
## type_groupacademia/engineering 0.17904 0.19785 0.905 0.36577
## type_groupgeneral fame 0.05273 0.12475 0.423 0.67264
## type_groupknown for death -0.09979 0.09996 -0.998 0.31843
## type_groupleadership 0.15945 0.08318 1.917 0.05559 .
## type_groupsports 0.04115 0.08387 0.491 0.62376
## age_group20:death_typeunnatural -0.63094 0.43567 -1.448 0.14793
## age_group30:death_typeunnatural -0.14156 0.42659 -0.332 0.74009
## age_group40:death_typeunnatural -0.16313 0.37877 -0.431 0.66682
## age_group50:death_typeunnatural -0.44845 0.35775 -1.254 0.21037
## age_group60:death_typeunnatural -0.30088 0.35364 -0.851 0.39511
## age_group80:death_typeunnatural -0.47493 0.54408 -0.873 0.38296
## age_group90:death_typeunnatural -0.81402 0.83740 -0.972 0.33129
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7739 on 845 degrees of freedom
## Multiple R-squared: 0.9055, Adjusted R-squared: 0.9027
## F-statistic: 323.8 on 25 and 845 DF, p-value: < 2.2e-16
##
##
##
## #####################################################
## REGRESSION MODEL
## Medium: NEWS
## Dependent variable: long-term boost
## Transformation on dependent variable: NONE
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.92547 -0.07094 -0.02103 0.03979 1.15319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank 0.025414 0.020412 1.245 0.213448
## age_group70 0.089665 0.013907 6.448 1.91e-10 ***
## age_group20 0.060730 0.058016 1.047 0.295500
## age_group30 0.055130 0.052042 1.059 0.289748
## age_group40 0.067078 0.029587 2.267 0.023633 *
## age_group50 0.053144 0.021251 2.501 0.012580 *
## age_group60 0.037372 0.015120 2.472 0.013643 *
## age_group80 0.070787 0.013675 5.176 2.83e-07 ***
## age_group90 0.078577 0.018407 4.269 2.19e-05 ***
## death_typeunnatural 0.054179 0.065388 0.829 0.407578
## genderFemale 0.035626 0.015782 2.257 0.024240 *
## anglonon_anglo -0.061710 0.016264 -3.794 0.000159 ***
## anglounknown -0.080713 0.018718 -4.312 1.81e-05 ***
## type_groupacademia/engineering -0.032227 0.043044 -0.749 0.454248
## type_groupgeneral fame -0.016796 0.027139 -0.619 0.536159
## type_groupknown for death -0.024095 0.021748 -1.108 0.268199
## type_groupleadership -0.057120 0.018097 -3.156 0.001654 **
## type_groupsports -0.032155 0.018246 -1.762 0.078382 .
## age_group20:death_typeunnatural 0.171955 0.094784 1.814 0.070003 .
## age_group30:death_typeunnatural 0.144053 0.092807 1.552 0.120995
## age_group40:death_typeunnatural 0.103868 0.082405 1.260 0.207855
## age_group50:death_typeunnatural -0.060255 0.077832 -0.774 0.439047
## age_group60:death_typeunnatural 0.053030 0.076937 0.689 0.490847
## age_group80:death_typeunnatural 0.009704 0.118368 0.082 0.934680
## age_group90:death_typeunnatural -0.144978 0.182183 -0.796 0.426381
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1684 on 845 degrees of freedom
## Multiple R-squared: 0.1994, Adjusted R-squared: 0.1757
## F-statistic: 8.418 on 25 and 845 DF, p-value: < 2.2e-16
##
##
## #####################################################
## REGRESSION MODEL
## Medium: TWITTER
## Dependent variable: short-term boost
## Transformation on dependent variable: NONE
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.94216 -0.41102 0.07389 0.57059 2.18694
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank 0.94026 0.10036 9.369 < 2e-16 ***
## age_group70 2.66931 0.06775 39.397 < 2e-16 ***
## age_group20 3.47492 0.28167 12.337 < 2e-16 ***
## age_group30 3.29553 0.25261 13.046 < 2e-16 ***
## age_group40 2.93793 0.14373 20.441 < 2e-16 ***
## age_group50 2.87237 0.10330 27.806 < 2e-16 ***
## age_group60 2.80662 0.07339 38.240 < 2e-16 ***
## age_group80 2.70010 0.06692 40.350 < 2e-16 ***
## age_group90 2.70831 0.08941 30.292 < 2e-16 ***
## death_typeunnatural 0.37666 0.31740 1.187 0.235683
## genderFemale -0.03440 0.07644 -0.450 0.652773
## anglonon_anglo -0.12447 0.07878 -1.580 0.114517
## anglounknown -0.33028 0.09150 -3.610 0.000325 ***
## type_groupacademia/engineering 0.33479 0.20886 1.603 0.109313
## type_groupgeneral fame 0.11715 0.13179 0.889 0.374306
## type_groupknown for death -0.08548 0.10636 -0.804 0.421793
## type_groupleadership 0.11857 0.08718 1.360 0.174159
## type_groupsports 0.06552 0.08892 0.737 0.461390
## age_group20:death_typeunnatural -0.21410 0.46007 -0.465 0.641789
## age_group30:death_typeunnatural -0.04720 0.45020 -0.105 0.916533
## age_group40:death_typeunnatural 0.16610 0.39898 0.416 0.677281
## age_group50:death_typeunnatural -0.16983 0.37827 -0.449 0.653564
## age_group60:death_typeunnatural -0.12923 0.37346 -0.346 0.729391
## age_group80:death_typeunnatural -0.49657 0.57411 -0.865 0.387321
## age_group90:death_typeunnatural -0.24853 0.88473 -0.281 0.778846
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8171 on 845 degrees of freedom
## Multiple R-squared: 0.9241, Adjusted R-squared: 0.9218
## F-statistic: 411.4 on 25 and 845 DF, p-value: < 2.2e-16
##
##
##
## #####################################################
## REGRESSION MODEL
## Medium: TWITTER
## Dependent variable: long-term boost
## Transformation on dependent variable: NONE
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.64801 -0.07463 -0.01514 0.04831 1.08654
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank 0.083630 0.022064 3.790 0.000161 ***
## age_group70 0.094879 0.014896 6.369 3.11e-10 ***
## age_group20 0.079564 0.061926 1.285 0.199202
## age_group30 0.207745 0.055537 3.741 0.000196 ***
## age_group40 0.196045 0.031600 6.204 8.61e-10 ***
## age_group50 0.086549 0.022711 3.811 0.000148 ***
## age_group60 0.071046 0.016136 4.403 1.21e-05 ***
## age_group80 0.082052 0.014712 5.577 3.29e-08 ***
## age_group90 0.072244 0.019656 3.675 0.000253 ***
## death_typeunnatural 0.090447 0.069782 1.296 0.195279
## genderFemale 0.006957 0.016805 0.414 0.678970
## anglonon_anglo -0.036397 0.017321 -2.101 0.035909 *
## anglounknown -0.079850 0.020116 -3.969 7.82e-05 ***
## type_groupacademia/engineering 0.022898 0.045918 0.499 0.618139
## type_groupgeneral fame -0.011641 0.028974 -0.402 0.687945
## type_groupknown for death 0.006300 0.023383 0.269 0.787670
## type_groupleadership -0.040311 0.019166 -2.103 0.035737 *
## type_groupsports -0.028105 0.019549 -1.438 0.150895
## age_group20:death_typeunnatural 0.318038 0.101148 3.144 0.001723 **
## age_group30:death_typeunnatural 0.021538 0.098979 0.218 0.827789
## age_group40:death_typeunnatural 0.007598 0.087717 0.087 0.930992
## age_group50:death_typeunnatural -0.063518 0.083164 -0.764 0.445219
## age_group60:death_typeunnatural 0.003213 0.082106 0.039 0.968796
## age_group80:death_typeunnatural 0.009910 0.126221 0.079 0.937438
## age_group90:death_typeunnatural -0.113872 0.194512 -0.585 0.558420
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1796 on 845 degrees of freedom
## Multiple R-squared: 0.3217, Adjusted R-squared: 0.3016
## F-statistic: 16.03 on 25 and 845 DF, p-value: < 2.2e-16
##
##
## #####################################################
## REGRESSION MODEL
## Medium: NEWS
## Dependent variable: short-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62147 -0.19550 0.01088 0.20087 0.73057
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank 0.2876073 0.0310288 9.269 < 2e-16 ***
## age_group70 -0.0010522 0.0211400 -0.050 0.96031
## age_group20 0.1238746 0.0881930 1.405 0.16051
## age_group30 0.0812147 0.0791111 1.027 0.30491
## age_group40 -0.0379687 0.0449768 -0.844 0.39881
## age_group50 -0.0170764 0.0323044 -0.529 0.59722
## age_group60 -0.0432214 0.0229844 -1.880 0.06039 .
## age_group80 0.0149510 0.0207886 0.719 0.47222
## age_group90 0.0749740 0.0279819 2.679 0.00752 **
## death_typeunnatural 0.3175289 0.0993996 3.194 0.00145 **
## genderFemale 0.0340646 0.0239914 1.420 0.15602
## anglonon_anglo -0.1047908 0.0247235 -4.239 2.5e-05 ***
## anglounknown -0.1532241 0.0284546 -5.385 9.4e-08 ***
## type_groupacademia/engineering 0.0530173 0.0654330 0.810 0.41802
## type_groupgeneral fame 0.0191595 0.0412557 0.464 0.64247
## type_groupknown for death -0.0403775 0.0330598 -1.221 0.22229
## type_groupleadership 0.0299099 0.0275100 1.087 0.27724
## type_groupsports -0.0005107 0.0277366 -0.018 0.98531
## age_group20:death_typeunnatural -0.1915827 0.1440851 -1.330 0.18399
## age_group30:death_typeunnatural -0.0252587 0.1410809 -0.179 0.85795
## age_group40:death_typeunnatural -0.0343547 0.1252681 -0.274 0.78396
## age_group50:death_typeunnatural -0.1337441 0.1183163 -1.130 0.25863
## age_group60:death_typeunnatural -0.0985255 0.1169552 -0.842 0.39979
## age_group80:death_typeunnatural -0.1499235 0.1799365 -0.833 0.40497
## age_group90:death_typeunnatural -0.3572365 0.2769449 -1.290 0.19743
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2559 on 845 degrees of freedom
## Multiple R-squared: 0.2383, Adjusted R-squared: 0.2157
## F-statistic: 10.57 on 25 and 845 DF, p-value: < 2.2e-16
##
##
##
## #####################################################
## REGRESSION MODEL
## Medium: NEWS
## Dependent variable: long-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63192 -0.21951 0.00156 0.23031 0.57993
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank -0.039227 0.033372 -1.175 0.240153
## age_group70 0.070331 0.022736 3.093 0.002045 **
## age_group20 0.096091 0.094853 1.013 0.311325
## age_group30 0.041532 0.085085 0.488 0.625588
## age_group40 0.014224 0.048373 0.294 0.768798
## age_group50 -0.025681 0.034744 -0.739 0.460022
## age_group60 -0.022031 0.024720 -0.891 0.373062
## age_group80 0.062061 0.022358 2.776 0.005630 **
## age_group90 0.112468 0.030095 3.737 0.000199 ***
## death_typeunnatural 0.147611 0.106906 1.381 0.167721
## genderFemale 0.038463 0.025803 1.491 0.136437
## anglonon_anglo -0.113644 0.026591 -4.274 2.14e-05 ***
## anglounknown -0.149934 0.030603 -4.899 1.15e-06 ***
## type_groupacademia/engineering -0.019377 0.070374 -0.275 0.783122
## type_groupgeneral fame -0.008856 0.044371 -0.200 0.841847
## type_groupknown for death -0.064415 0.035556 -1.812 0.070396 .
## type_groupleadership -0.070072 0.029587 -2.368 0.018094 *
## type_groupsports -0.044702 0.029831 -1.498 0.134378
## age_group20:death_typeunnatural -0.027778 0.154966 -0.179 0.857782
## age_group30:death_typeunnatural -0.004357 0.151735 -0.029 0.977101
## age_group40:death_typeunnatural 0.026451 0.134728 0.196 0.844400
## age_group50:death_typeunnatural -0.098592 0.127251 -0.775 0.438686
## age_group60:death_typeunnatural 0.059818 0.125787 0.476 0.634521
## age_group80:death_typeunnatural -0.009561 0.193525 -0.049 0.960608
## age_group90:death_typeunnatural -0.288201 0.297859 -0.968 0.333534
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2753 on 845 degrees of freedom
## Multiple R-squared: 0.1189, Adjusted R-squared: 0.09283
## F-statistic: 4.561 on 25 and 845 DF, p-value: 2.333e-12
##
##
## #####################################################
## REGRESSION MODEL
## Medium: TWITTER
## Dependent variable: short-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.71524 -0.18611 -0.00282 0.20651 0.61822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank 0.334205 0.031828 10.500 < 2e-16 ***
## age_group70 -0.033041 0.021488 -1.538 0.124511
## age_group20 0.237288 0.089330 2.656 0.008049 **
## age_group30 0.175763 0.080115 2.194 0.028516 *
## age_group40 0.035998 0.045584 0.790 0.429916
## age_group50 0.020332 0.032761 0.621 0.535032
## age_group60 0.005842 0.023277 0.251 0.801897
## age_group80 -0.026348 0.021223 -1.241 0.214778
## age_group90 -0.025258 0.028355 -0.891 0.373308
## death_typeunnatural 0.108666 0.100663 1.079 0.280675
## genderFemale -0.007697 0.024242 -0.318 0.750933
## anglonon_anglo -0.033544 0.024986 -1.342 0.179797
## anglounknown -0.109642 0.029019 -3.778 0.000169 ***
## type_groupacademia/engineering 0.118763 0.066238 1.793 0.073337 .
## type_groupgeneral fame 0.047796 0.041796 1.144 0.253129
## type_groupknown for death -0.026487 0.033731 -0.785 0.432540
## type_groupleadership 0.026762 0.027648 0.968 0.333343
## type_groupsports 0.023447 0.028200 0.831 0.405954
## age_group20:death_typeunnatural -0.078726 0.145910 -0.540 0.589650
## age_group30:death_typeunnatural 0.001729 0.142781 0.012 0.990343
## age_group40:death_typeunnatural 0.082247 0.126535 0.650 0.515874
## age_group50:death_typeunnatural -0.030965 0.119968 -0.258 0.796386
## age_group60:death_typeunnatural -0.014870 0.118442 -0.126 0.900118
## age_group80:death_typeunnatural -0.166931 0.182080 -0.917 0.359509
## age_group90:death_typeunnatural -0.125260 0.280592 -0.446 0.655414
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2591 on 845 degrees of freedom
## Multiple R-squared: 0.2191, Adjusted R-squared: 0.196
## F-statistic: 9.482 on 25 and 845 DF, p-value: < 2.2e-16
##
##
##
## #####################################################
## REGRESSION MODEL
## Medium: TWITTER
## Dependent variable: long-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.70715 -0.19327 0.02603 0.21907 0.59020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank 0.116011 0.033760 3.436 0.000618 ***
## age_group70 0.038062 0.022793 1.670 0.095301 .
## age_group20 0.071226 0.094752 0.752 0.452437
## age_group30 0.179281 0.084978 2.110 0.035174 *
## age_group40 0.097324 0.048351 2.013 0.044444 *
## age_group50 -0.009159 0.034750 -0.264 0.792173
## age_group60 -0.008024 0.024690 -0.325 0.745276
## age_group80 0.023841 0.022511 1.059 0.289860
## age_group90 0.007153 0.030076 0.238 0.812081
## death_typeunnatural 0.122294 0.106773 1.145 0.252381
## genderFemale 0.029356 0.025713 1.142 0.253903
## anglonon_anglo -0.034279 0.026503 -1.293 0.196225
## anglounknown -0.141044 0.030780 -4.582 5.29e-06 ***
## type_groupacademia/engineering 0.076314 0.070259 1.086 0.277705
## type_groupgeneral fame 0.003185 0.044332 0.072 0.942744
## type_groupknown for death -0.024310 0.035778 -0.679 0.497031
## type_groupleadership -0.101816 0.029326 -3.472 0.000543 ***
## type_groupsports -0.023928 0.029912 -0.800 0.423974
## age_group20:death_typeunnatural 0.090805 0.154766 0.587 0.557546
## age_group30:death_typeunnatural -0.082225 0.151447 -0.543 0.587321
## age_group40:death_typeunnatural -0.019321 0.134215 -0.144 0.885570
## age_group50:death_typeunnatural -0.079483 0.127249 -0.625 0.532389
## age_group60:death_typeunnatural -0.001916 0.125630 -0.015 0.987838
## age_group80:death_typeunnatural -0.006991 0.193131 -0.036 0.971132
## age_group90:death_typeunnatural -0.144877 0.297622 -0.487 0.626539
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2749 on 845 degrees of freedom
## Multiple R-squared: 0.1214, Adjusted R-squared: 0.0954
## F-statistic: 4.67 on 25 and 845 DF, p-value: 8.867e-13
for (ranks in c(FALSE, TRUE)) {
mods <- run_regression_analysis_T_vs_N(ranks)
print_regression_table_T_vs_N(mods, ranks)
print_model_description('NEWS vs. TWITTER', 'short-term boost', ranks)
print(summary(mods$mod_peak))
print_model_description('NEWS vs. TWITTER', 'long-term boost', ranks)
print(summary(mods$mod_perm))
}
##
##
## #####################################################
## REGRESSION MODEL
## Medium: NEWS vs. TWITTER
## Dependent variable: short-term boost
## Transformation on dependent variable: NONE
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s_diff ~ %s",
## suffix, predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.65313 -0.36479 0.00554 0.34318 2.14224
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.42705 0.04688 -9.109 < 2e-16 ***
## mean_before_relrank_diff -0.21235 0.08346 -2.544 0.01112 *
## age_group20 -0.57747 0.12582 -4.590 5.11e-06 ***
## age_group30 -0.23483 0.12379 -1.897 0.05816 .
## age_group40 -0.37445 0.09305 -4.024 6.22e-05 ***
## age_group50 -0.20387 0.07324 -2.783 0.00550 **
## age_group60 -0.17498 0.06057 -2.889 0.00396 **
## age_group80 0.01435 0.05778 0.248 0.80388
## age_group90 0.16419 0.07233 2.270 0.02346 *
## death_typeunnatural 0.34814 0.06995 4.977 7.82e-07 ***
## genderFemale 0.09071 0.05323 1.704 0.08870 .
## anglonon_anglo -0.21941 0.05437 -4.036 5.93e-05 ***
## anglounknown -0.05238 0.06307 -0.831 0.40646
## type_groupacademia/engineering -0.04838 0.14619 -0.331 0.74076
## type_groupgeneral fame -0.01644 0.09169 -0.179 0.85776
## type_groupknown for death 0.10452 0.07390 1.414 0.15760
## type_groupleadership 0.19975 0.06208 3.217 0.00134 **
## type_groupsports 0.05875 0.06191 0.949 0.34292
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5712 on 852 degrees of freedom
## Multiple R-squared: 0.1013, Adjusted R-squared: 0.08336
## F-statistic: 5.648 on 17 and 852 DF, p-value: 2.893e-12
##
##
##
## #####################################################
## REGRESSION MODEL
## Medium: NEWS vs. TWITTER
## Dependent variable: long-term boost
## Transformation on dependent variable: NONE
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s_diff ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.25725 -0.05774 0.00741 0.06234 0.70460
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.014983 0.014259 -1.051 0.293665
## mean_before_relrank_diff -0.034104 0.025383 -1.344 0.179447
## age_group20 -0.134769 0.038268 -3.522 0.000452 ***
## age_group30 -0.089151 0.037649 -2.368 0.018110 *
## age_group40 -0.101234 0.028300 -3.577 0.000367 ***
## age_group50 -0.028959 0.022277 -1.300 0.193956
## age_group60 -0.020521 0.018421 -1.114 0.265602
## age_group80 -0.006351 0.017573 -0.361 0.717893
## age_group90 0.015400 0.021998 0.700 0.484072
## death_typeunnatural -0.008434 0.021275 -0.396 0.691877
## genderFemale 0.028585 0.016189 1.766 0.077796 .
## anglonon_anglo -0.023095 0.016535 -1.397 0.162868
## anglounknown 0.012408 0.019182 0.647 0.517899
## type_groupacademia/engineering -0.045961 0.044463 -1.034 0.301570
## type_groupgeneral fame 0.002200 0.027887 0.079 0.937150
## type_groupknown for death -0.015268 0.022475 -0.679 0.497109
## type_groupleadership -0.006173 0.018882 -0.327 0.743791
## type_groupsports 0.009053 0.018829 0.481 0.630790
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1737 on 852 degrees of freedom
## Multiple R-squared: 0.05249, Adjusted R-squared: 0.03358
## F-statistic: 2.776 on 17 and 852 DF, p-value: 0.0001535
##
##
##
## #####################################################
## REGRESSION MODEL
## Medium: NEWS vs. TWITTER
## Dependent variable: short-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s_diff ~ %s",
## suffix, predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6496 -0.1208 0.0038 0.1106 0.7404
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.007131 0.016294 0.438 0.661761
## mean_before_relrank_diff -0.077831 0.029007 -2.683 0.007434 **
## age_group20 -0.177328 0.043731 -4.055 5.47e-05 ***
## age_group30 -0.091639 0.043024 -2.130 0.033461 *
## age_group40 -0.112473 0.032340 -3.478 0.000531 ***
## age_group50 -0.058774 0.025457 -2.309 0.021195 *
## age_group60 -0.061181 0.021051 -2.906 0.003753 **
## age_group80 0.012762 0.020081 0.636 0.525260
## age_group90 0.072718 0.025139 2.893 0.003917 **
## death_typeunnatural 0.132598 0.024312 5.454 6.46e-08 ***
## genderFemale 0.030131 0.018500 1.629 0.103744
## anglonon_anglo -0.077795 0.018896 -4.117 4.21e-05 ***
## anglounknown -0.017957 0.021921 -0.819 0.412906
## type_groupacademia/engineering -0.026667 0.050811 -0.525 0.599843
## type_groupgeneral fame -0.011339 0.031869 -0.356 0.722071
## type_groupknown for death 0.028830 0.025684 1.123 0.261965
## type_groupleadership 0.060712 0.021578 2.814 0.005011 **
## type_groupsports 0.004572 0.021517 0.212 0.831794
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1985 on 852 degrees of freedom
## Multiple R-squared: 0.1042, Adjusted R-squared: 0.08629
## F-statistic: 5.827 on 17 and 852 DF, p-value: 9.067e-13
##
##
##
## #####################################################
## REGRESSION MODEL
## Medium: NEWS vs. TWITTER
## Dependent variable: long-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s_diff ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.92186 -0.12959 0.01391 0.14260 0.95349
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.008043 0.022742 0.354 0.72367
## mean_before_relrank_diff -0.230628 0.040485 -5.697 1.68e-08 ***
## age_group20 -0.105519 0.061035 -1.729 0.08420 .
## age_group30 -0.128012 0.060048 -2.132 0.03331 *
## age_group40 -0.106048 0.045137 -2.349 0.01903 *
## age_group50 -0.043230 0.035530 -1.217 0.22405
## age_group60 -0.026262 0.029381 -0.894 0.37165
## age_group80 0.004226 0.028027 0.151 0.88019
## age_group90 0.080720 0.035086 2.301 0.02165 *
## death_typeunnatural 0.039642 0.033933 1.168 0.24303
## genderFemale 0.007763 0.025820 0.301 0.76373
## anglonon_anglo -0.074869 0.026373 -2.839 0.00464 **
## anglounknown 0.023978 0.030594 0.784 0.43341
## type_groupacademia/engineering -0.069726 0.070916 -0.983 0.32578
## type_groupgeneral fame 0.003712 0.044479 0.083 0.93352
## type_groupknown for death 0.001229 0.035846 0.034 0.97266
## type_groupleadership 0.066667 0.030116 2.214 0.02711 *
## type_groupsports 0.008463 0.030031 0.282 0.77816
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2771 on 852 degrees of freedom
## Multiple R-squared: 0.0743, Adjusted R-squared: 0.05583
## F-statistic: 4.023 on 17 and 852 DF, p-value: 9.057e-08
for (ranks in c(FALSE, TRUE)) {
mods <- run_regression_analysis_T_vs_N_for_death_types(ranks)
# Short-term.
print_model_description('NEWS vs. TWITTER', 'short-term boost', ranks)
print(summary(mods$mod_peak))
# Long-term.
print_model_description('NEWS vs. TWITTER', 'long-term boost', ranks)
print(summary(mods$mod_perm))
# Plots.
par(mfrow=c(1,2))
plot_age_coeffs_for_death_types(mods$mod_peak, NA, 'peak_mean_boost_diff', BASE_AGE, ranks,
if (ranks) c(-0.45, 0.45) else c(-1.5, 0.5), 'black')
plot_age_coeffs_for_death_types(mods$mod_perm, NA, 'perm_boost_diff', BASE_AGE, ranks,
if (ranks) c(-0.45, 0.45) else c(-0.5, 0.5), 'black')
}
##
##
## #####################################################
## REGRESSION MODEL
## Medium: NEWS vs. TWITTER
## Dependent variable: short-term boost
## Transformation on dependent variable: NONE
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s_diff ~ %s",
## suffix, predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.65119 -0.36422 0.00888 0.34055 2.14507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank_diff -0.21682 0.08400 -2.581 0.01001 *
## age_group70 -0.42927 0.04758 -9.022 < 2e-16 ***
## age_group20 -0.82078 0.19722 -4.162 3.48e-05 ***
## age_group30 -0.80494 0.17718 -4.543 6.35e-06 ***
## age_group40 -0.78266 0.10074 -7.769 2.28e-14 ***
## age_group50 -0.63262 0.07226 -8.755 < 2e-16 ***
## age_group60 -0.60094 0.05132 -11.711 < 2e-16 ***
## age_group80 -0.41640 0.04659 -8.938 < 2e-16 ***
## age_group90 -0.26085 0.06271 -4.160 3.51e-05 ***
## death_typeunnatural 0.40697 0.22256 1.829 0.06782 .
## genderFemale 0.09575 0.05356 1.788 0.07419 .
## anglonon_anglo -0.21719 0.05509 -3.942 8.74e-05 ***
## anglounknown -0.05533 0.06348 -0.872 0.38367
## type_groupacademia/engineering -0.04801 0.14652 -0.328 0.74326
## type_groupgeneral fame -0.01437 0.09234 -0.156 0.87634
## type_groupknown for death 0.10249 0.07432 1.379 0.16825
## type_groupleadership 0.19868 0.06232 3.188 0.00148 **
## type_groupsports 0.05601 0.06229 0.899 0.36880
## age_group20:death_typeunnatural -0.32913 0.32231 -1.021 0.30746
## age_group30:death_typeunnatural 0.17762 0.31605 0.562 0.57426
## age_group40:death_typeunnatural -0.11676 0.28006 -0.417 0.67686
## age_group50:death_typeunnatural -0.04974 0.26513 -0.188 0.85125
## age_group60:death_typeunnatural -0.06810 0.26168 -0.260 0.79474
## age_group80:death_typeunnatural 0.16940 0.40237 0.421 0.67385
## age_group90:death_typeunnatural -0.37706 0.61904 -0.609 0.54262
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5724 on 845 degrees of freedom
## Multiple R-squared: 0.4346, Adjusted R-squared: 0.4178
## F-statistic: 25.98 on 25 and 845 DF, p-value: < 2.2e-16
##
##
##
## #####################################################
## REGRESSION MODEL
## Medium: NEWS vs. TWITTER
## Dependent variable: long-term boost
## Transformation on dependent variable: NONE
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s_diff ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.20604 -0.05399 0.00862 0.05958 0.65987
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank_diff -0.0387605 0.0254315 -1.524 0.12785
## age_group70 -0.0127546 0.0144053 -0.885 0.37618
## age_group20 -0.0266540 0.0597136 -0.446 0.65545
## age_group30 -0.1643295 0.0536460 -3.063 0.00226 **
## age_group40 -0.1367607 0.0305002 -4.484 8.34e-06 ***
## age_group50 -0.0394232 0.0218780 -1.802 0.07191 .
## age_group60 -0.0364275 0.0155368 -2.345 0.01928 *
## age_group80 -0.0201342 0.0141055 -1.427 0.15383
## age_group90 0.0010249 0.0189868 0.054 0.95697
## death_typeunnatural -0.0496287 0.0673856 -0.736 0.46164
## genderFemale 0.0285184 0.0162164 1.759 0.07900 .
## anglonon_anglo -0.0235607 0.0166806 -1.412 0.15818
## anglounknown 0.0105550 0.0192186 0.549 0.58301
## type_groupacademia/engineering -0.0463112 0.0443633 -1.044 0.29683
## type_groupgeneral fame 0.0001484 0.0279576 0.005 0.99577
## type_groupknown for death -0.0162957 0.0225018 -0.724 0.46915
## type_groupleadership -0.0049896 0.0188673 -0.264 0.79149
## type_groupsports 0.0050388 0.0188582 0.267 0.78939
## age_group20:death_typeunnatural -0.1369124 0.0975852 -1.403 0.16098
## age_group30:death_typeunnatural 0.1441439 0.0956893 1.506 0.13234
## age_group40:death_typeunnatural 0.1064914 0.0847944 1.256 0.20951
## age_group50:death_typeunnatural 0.0275709 0.0802742 0.343 0.73134
## age_group60:death_typeunnatural 0.0599112 0.0792287 0.756 0.44975
## age_group80:death_typeunnatural 0.0063933 0.1218248 0.052 0.95816
## age_group90:death_typeunnatural 0.0029407 0.1874274 0.016 0.98749
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1733 on 845 degrees of freedom
## Multiple R-squared: 0.101, Adjusted R-squared: 0.07445
## F-statistic: 3.799 on 25 and 845 DF, p-value: 1.803e-09
##
##
## #####################################################
## REGRESSION MODEL
## Medium: NEWS vs. TWITTER
## Dependent variable: short-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("peak_mean_boost%s_diff ~ %s",
## suffix, predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.64846 -0.11978 0.00327 0.10887 0.74189
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank_diff -0.079660 0.029192 -2.729 0.006487 **
## age_group70 0.005902 0.016535 0.357 0.721241
## age_group20 -0.131089 0.068543 -1.913 0.056148 .
## age_group30 -0.142074 0.061578 -2.307 0.021283 *
## age_group40 -0.100671 0.035010 -2.876 0.004135 **
## age_group50 -0.052507 0.025113 -2.091 0.036838 *
## age_group60 -0.051830 0.017834 -2.906 0.003754 **
## age_group80 0.018036 0.016191 1.114 0.265611
## age_group90 0.080562 0.021794 3.696 0.000233 ***
## death_typeunnatural 0.155420 0.077349 2.009 0.044819 *
## genderFemale 0.032534 0.018614 1.748 0.080860 .
## anglonon_anglo -0.076134 0.019147 -3.976 7.6e-05 ***
## anglounknown -0.018445 0.022060 -0.836 0.403316
## type_groupacademia/engineering -0.026851 0.050923 -0.527 0.598128
## type_groupgeneral fame -0.010738 0.032091 -0.335 0.738007
## type_groupknown for death 0.027646 0.025829 1.070 0.284766
## type_groupleadership 0.060264 0.021657 2.783 0.005512 **
## type_groupsports 0.004710 0.021646 0.218 0.827786
## age_group20:death_typeunnatural -0.081471 0.112014 -0.727 0.467224
## age_group30:death_typeunnatural 0.071281 0.109837 0.649 0.516537
## age_group40:death_typeunnatural -0.038913 0.097332 -0.400 0.689404
## age_group50:death_typeunnatural -0.020927 0.092143 -0.227 0.820392
## age_group60:death_typeunnatural -0.046500 0.090943 -0.511 0.609270
## age_group80:death_typeunnatural 0.071107 0.139837 0.508 0.611238
## age_group90:death_typeunnatural -0.166528 0.215139 -0.774 0.439120
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1989 on 845 degrees of freedom
## Multiple R-squared: 0.1079, Adjusted R-squared: 0.08154
## F-statistic: 4.089 on 25 and 845 DF, p-value: 1.466e-10
##
##
##
## #####################################################
## REGRESSION MODEL
## Medium: NEWS vs. TWITTER
## Dependent variable: long-term boost
## Transformation on dependent variable: RELATIVE RANKS
## #####################################################
##
## Call:
## lm(formula = as.formula(sprintf("perm_boost%s_diff ~ %s", suffix,
## predictors)), data = lmdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.92420 -0.13055 0.01538 0.14045 0.95719
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## mean_before_relrank_diff -0.2341465 0.0407524 -5.746 1.28e-08 ***
## age_group70 0.0106514 0.0230835 0.461 0.64461
## age_group20 0.0031530 0.0956873 0.033 0.97372
## age_group30 -0.1718587 0.0859643 -1.999 0.04591 *
## age_group40 -0.1054249 0.0488746 -2.157 0.03128 *
## age_group50 -0.0333427 0.0350581 -0.951 0.34184
## age_group60 -0.0213921 0.0248968 -0.859 0.39046
## age_group80 0.0133539 0.0226032 0.591 0.55481
## age_group90 0.0899990 0.0304251 2.958 0.00318 **
## death_typeunnatural -0.0135298 0.1079812 -0.125 0.90032
## genderFemale 0.0080192 0.0259857 0.309 0.75770
## anglonon_anglo -0.0751648 0.0267296 -2.812 0.00504 **
## anglounknown 0.0227608 0.0307967 0.739 0.46007
## type_groupacademia/engineering -0.0697905 0.0710894 -0.982 0.32651
## type_groupgeneral fame 0.0031289 0.0448002 0.070 0.94434
## type_groupknown for death -0.0002293 0.0360578 -0.006 0.99493
## type_groupleadership 0.0668489 0.0302337 2.211 0.02730 *
## type_groupsports 0.0050713 0.0302191 0.168 0.86677
## age_group20:death_typeunnatural -0.0923192 0.1563740 -0.590 0.55510
## age_group30:death_typeunnatural 0.1416421 0.1533360 0.924 0.35589
## age_group40:death_typeunnatural 0.0782650 0.1358777 0.576 0.56477
## age_group50:death_typeunnatural 0.0504094 0.1286343 0.392 0.69524
## age_group60:death_typeunnatural 0.0908410 0.1269589 0.716 0.47449
## age_group80:death_typeunnatural 0.0187045 0.1952165 0.096 0.92369
## age_group90:death_typeunnatural -0.0496572 0.3003405 -0.165 0.86872
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2777 on 845 degrees of freedom
## Multiple R-squared: 0.07775, Adjusted R-squared: 0.05046
## F-statistic: 2.849 on 25 and 845 DF, p-value: 4.86e-06
regdata <- load_doc_length_regression_data()
# Only constant, i.e., estimate mean (plus standard errors).
plot_doc_length_increase(regdata, '1', 'Direct estimate')
# Adjust for those predictors that were found to have a significant impact on short-term
# post-mortem mention frequency in the news.
# (All possible predictors: mean_before_relrank, age_group, death_type, gender, anglo, type_group.)
plot_doc_length_increase(regdata, 'mean_before_relrank + age_group + death_type + anglo', 'Adjusted estimate')
Figure 12: Relative length increase of documents that mention deceased public figures, with respect to the respective public figure’s pre-mortem mean document length, as a function of days since death. All means in this analysis are geometric means. Error bars are 95% confidence intervals approximated as \(\pm 2\) standard errors. Left: Direct mean estimates of relative length increase. Right: Estimates adjusted for population drift (since certain groups of people are more likely to be mentioned post-mortem than others). Each day \(t\)’s estimate was obtained from a separate linear regression model for day \(t\) only, which included each person \(i\) mentioned that day as a data point, with \(\log(L_{it}/P_i)\) as the dependent variable (where \(L_{it}\) is \(i\)’s mean document length on day \(t\), and \(P_i\) is \(i\)’s pre-mortem mean document length), and with independent variables defined by the predictors that were previously found to be significantly associated with short-term post-mortem mention frequency (language, pre-mortem mean mention frequency rank, manner of death, age group), such that estimates are for anglophones of median pre-mortem popularity who died an unnatural death at age 70-79 years. The adjusted estimate of the (geometric) mean relative length increase for day \(t\) is given by \(e^{a_t}-1\), where \(a_t\) is the intercept of the regression for day \(t\). We see that, both with and without adjustment, the documents that mentioned a person on the day of their death were on average about 40% shorter than documents that mentioned the person before their death, and that the pre-mortem level is reached again after about one month.